In [1]:
import numpy as np
from pandas import DataFrame
import pandas as pd

# How does the dataset look?

In [2]:
# Load the raw tab-separated player statistics into `df`.
# NOTE(review): read_csv takes the first line of the file as the header row by
# default; confirm the file really starts with a header, otherwise the first
# record is consumed as column names.
df = pd.read_csv('data/NBA_Stats.txt', sep='\t')
# (rows, columns) of the raw frame.
df.shape

(1514, 26)

In [3]:
# Replace the header with short labels. Assignment is positional, so the
# order of this list must match the order of the 26 columns in the frame.
df.columns = [
    "GP", "GS", "MIN",
    "FGM", "FGA", "FG%",
    "3PM", "3PA", "3P%",
    "FTM", "FTA", "FT%",
    "OFF", "DEF", "TRB",
    "AST", "STL", "BLK", "PF", "TOV", "PTS",
    "YR", "POS", "W", "H", "NAME",
]

# Feature names

GP, GS - Games played, games started 

MIN - Minutes played

FGM, FGA, FG% - Field goals MADE, ATTEMPTED, PERCENTAGE

3PM, 3PA, 3P% - Three point field goals MADE, ATTEMPTED, PERCENTAGE

FTM, FTA, FT% - Free throws MADE, ATTEMPTED, PERCENTAGE

OFF, DEF, TRB - Offensive Rebounds, Defensive Rebounds, TOTAL Rebounds

AST - Assists

STL - Steals

BLK - Blocks

PF - Personal Fouls

TOV - Turnovers

PTS - Points scored

YR - Years playing

W - Weight

H - Height 

POS - Position

In [4]:
# Column labels in positional order.
columns_list = [
    "GP", "GS", "MIN",
    "FGM", "FGA", "FG%",
    "3PM", "3PA", "3P%",
    "FTM", "FTA", "FT%",
    "OFF", "DEF", "TRB",
    "AST", "STL", "BLK", "PF", "TOV", "PTS",
    "YR", "POS", "W", "H", "NAME",
]

# Label -> zero-based position lookup, derived from the list so the two
# structures cannot drift apart.
columns = {label: position for position, label in enumerate(columns_list)}

# How does the DataFrame look? 

In [5]:
# First three rows, first 13 columns.
# `.ix` is deprecated (removed in pandas 1.0); `.iloc` is purely positional.
# The old `.ix[:2]` row slice was label-based and inclusive, hence `:3` here
# (df still has the default 0..n-1 index produced by read_csv).
df.iloc[:3, :13]

Unnamed: 0,GP,GS,MIN,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OFF
0,479,81,22.9,4.08,7.65,0.534,0.01,0.04,0.176,2.72,3.45,0.788,1.98
1,62,0,7.4,0.73,2.26,0.321,0.27,0.97,0.283,0.29,0.37,0.783,0.16
2,514,120,21.1,3.14,7.03,0.447,0.02,0.11,0.182,1.7,2.43,0.7,1.42


In [6]:
# First three rows, remaining columns (position 13 onwards).
# `.ix` is deprecated (removed in pandas 1.0); `.iloc` is purely positional.
# The old `.ix[:2]` row slice was label-based and inclusive, hence `:3` here
# (df still has the default 0..n-1 index produced by read_csv).
df.iloc[:3, 13:]

Unnamed: 0,DEF,TRB,AST,STL,BLK,PF,TOV,PTS,YR,POS,W,H,NAME
0,3.03,5.01,0.68,0.43,0.41,2.39,1.22,10.89,8,34,112,206,Carl Landry
1,0.66,0.82,0.84,0.32,0.05,0.71,0.39,2.02,2,12,84,193,Gabe Pruitt
2,2.93,4.35,0.89,0.69,0.34,2.41,1.03,8.0,8,4,131,206,Glen Davis


In [7]:
# First three rows, first 23 columns, kept in `df_preview`.
# `.ix` is deprecated (removed in pandas 1.0); `.iloc` is purely positional.
# The old `.ix[:2]` row slice was label-based and inclusive, hence `:3` here
# (df still has the default 0..n-1 index produced by read_csv).
df_preview = df.iloc[:3, :23]
df_preview

Unnamed: 0,GP,GS,MIN,FGM,FGA,FG%,3PM,3PA,3P%,FTM,...,DEF,TRB,AST,STL,BLK,PF,TOV,PTS,YR,POS
0,479,81,22.9,4.08,7.65,0.534,0.01,0.04,0.176,2.72,...,3.03,5.01,0.68,0.43,0.41,2.39,1.22,10.89,8,34
1,62,0,7.4,0.73,2.26,0.321,0.27,0.97,0.283,0.29,...,0.66,0.82,0.84,0.32,0.05,0.71,0.39,2.02,2,12
2,514,120,21.1,3.14,7.03,0.447,0.02,0.11,0.182,1.7,...,2.93,4.35,0.89,0.69,0.34,2.41,1.03,8.0,8,4


# Cleaning the data

In [8]:
# Look up one player by name and display the matching row(s).
jennings_mask = df['NAME'] == 'Jason Jennings'
df[jennings_mask]

Unnamed: 0,GP,GS,MIN,FGM,FGA,FG%,3PM,3PA,3P%,FTM,...,STL,BLK,PF,TOV,PTS,YR,POS,W,H,NAME
405,Sr,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,1,5,113,213,Jason Jennings


In [9]:
# Positional lookup of row 405; the list argument makes iloc return a
# one-row DataFrame rather than a Series. Kept in `hm` for the next cell.
hm = df.iloc[[405]]
hm

Unnamed: 0,GP,GS,MIN,FGM,FGA,FG%,3PM,3PA,3P%,FTM,...,STL,BLK,PF,TOV,PTS,YR,POS,W,H,NAME
405,Sr,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,1,5,113,213,Jason Jennings


In [10]:
# Pull the raw "GS" entry of the single row in `hm` to see what it contains.
value = hm["GS"].iloc[0]
value

'-'

In [11]:
# Shape before any rows are removed, for comparison after the drop below.
df.shape

(1514, 26)

In [12]:
# List the column labels, one per line.
# print(...) with a single argument behaves identically on Python 2 and
# Python 3, whereas the bare `print i` statement is a SyntaxError on Python 3.
for i in columns_list:
    print(i)

GP
GS
MIN
FGM
FGA
FG%
3PM
3PA
3P%
FTM
FTA
FT%
OFF
DEF
TRB
AST
STL
BLK
PF
TOV
PTS
YR
POS
W
H
NAME


In [13]:

# Rows whose "DEF" entry is the "-" placeholder string.
# NOTE(review): only the "DEF" column is checked; presumably a row that has
# "-" there has it in the other stat columns too. Confirm against the data.
placeholder_mask = df["DEF"] == "-"
remove = df[placeholder_mask]
# Index labels of those rows; the drop itself happens in the next cell.
remove_index_list = list(remove.index.values)


In [14]:
# Drop the placeholder rows by index label.
# `remove_index_list` holds labels, so it is passed to drop() directly.
# Going through `df.index[...]` would treat the labels as positions, which
# is only correct while label == position and drops the wrong rows if this
# cell is re-run after the index has gaps. With labels, a re-run fails
# loudly (labels no longer present) instead of removing valid rows.
df = df.drop(remove_index_list)

In [15]:
# Positional check after the drop: position 405 should now hold a regular
# row rather than the placeholder row inspected earlier.
df.iloc[405]

GP               162
GS                 5
MIN             11.6
FGM             1.52
FGA             3.27
FG%            0.466
3PM             0.01
3PA             0.03
3P%            0.200
FTM             0.88
FTA             1.33
FT%            0.657
OFF             1.10
DEF             1.85
TRB             2.94
AST             0.25
STL             0.23
BLK             0.36
PF              2.00
TOV             0.67
PTS             3.93
YR                 4
POS                4
W                118
H                203
NAME    Lonny Baxter
Name: 406, dtype: object

In [16]:
# True if any cell in the frame is null (NaN/None).
# Note: "-" placeholder strings are not nulls and are not detected here.
df.isnull().values.any()

False

# Output

In [17]:
# Write the cleaned frame as headerless, index-free TSV.
# Forward slash instead of "data\cleaned.txt": the backslash is an invalid
# escape sequence and only acts as a path separator on Windows; elsewhere it
# creates a file literally named "data\cleaned.txt" in the working directory.
# "/" works on every platform and matches the other paths in this notebook.
df.to_csv("data/cleaned.txt", sep="\t", index=False, header=False)

# Data for classification_5_positions (1,2,3,4,5)

In [18]:
# Preview the frame after the drop; POS still holds the raw position codes here.
df.head(5)

Unnamed: 0,GP,GS,MIN,FGM,FGA,FG%,3PM,3PA,3P%,FTM,...,STL,BLK,PF,TOV,PTS,YR,POS,W,H,NAME
0,479,81,22.9,4.08,7.65,0.534,0.01,0.04,0.176,2.72,...,0.43,0.41,2.39,1.22,10.89,8,34,112,206,Carl Landry
1,62,0,7.4,0.73,2.26,0.321,0.27,0.97,0.283,0.29,...,0.32,0.05,0.71,0.39,2.02,2,12,84,193,Gabe Pruitt
2,514,120,21.1,3.14,7.03,0.447,0.02,0.11,0.182,1.7,...,0.69,0.34,2.41,1.03,8.0,8,4,131,206,Glen Davis
3,52,2,8.3,1.31,3.08,0.425,0.0,0.0,0.0,0.5,...,0.17,0.35,1.12,0.42,3.12,2,4,104,208,Jermareo Davidson
4,387,164,19.6,2.17,4.56,0.475,0.45,1.34,0.337,0.75,...,0.53,0.48,1.94,0.89,5.53,8,4,109,208,Josh McRoberts


In [19]:
# Keep only rows whose position code is at most 5 (the header above names
# these as positions 1-5).
single_position_mask = df['POS'] <= 5
df5 = df[single_position_mask]
df5.head(2)

Unnamed: 0,GP,GS,MIN,FGM,FGA,FG%,3PM,3PA,3P%,FTM,...,STL,BLK,PF,TOV,PTS,YR,POS,W,H,NAME
2,514,120,21.1,3.14,7.03,0.447,0.02,0.11,0.182,1.7,...,0.69,0.34,2.41,1.03,8.0,8,4,131,206,Glen Davis
3,52,2,8.3,1.31,3.08,0.425,0.0,0.0,0.0,0.5,...,0.17,0.35,1.12,0.42,3.12,2,4,104,208,Jermareo Davidson


In [20]:
# Distinct position codes left in the five-position subset.
df5['POS'].unique()

array([4, 5, 1, 3, 2], dtype=int64)

In [21]:
# Row/column count of the five-position subset.
df5.shape

(712, 26)

# Output 

In [22]:
# Export the five-position subset as headerless, index-free TSV.
# The data5/ directory must already exist; to_csv does not create it.
df5.to_csv(
    'data5/cleaned.txt',
    sep="\t",
    index=False,
    header=False
)

# Data for classification_3_positions (1,2,3)

In [23]:
# Preview before the position codes are merged into coarser classes below.
df.head(3)

Unnamed: 0,GP,GS,MIN,FGM,FGA,FG%,3PM,3PA,3P%,FTM,...,STL,BLK,PF,TOV,PTS,YR,POS,W,H,NAME
0,479,81,22.9,4.08,7.65,0.534,0.01,0.04,0.176,2.72,...,0.43,0.41,2.39,1.22,10.89,8,34,112,206,Carl Landry
1,62,0,7.4,0.73,2.26,0.321,0.27,0.97,0.283,0.29,...,0.32,0.05,0.71,0.39,2.02,2,12,84,193,Gabe Pruitt
2,514,120,21.1,3.14,7.03,0.447,0.02,0.11,0.182,1.7,...,0.69,0.34,2.41,1.03,8.0,8,4,131,206,Glen Davis


In [24]:
# Distinct raw position codes present before merging.
df['POS'].unique()

array([34, 12,  4,  5,  1,  3, 45,  2, 23], dtype=int64)

In [25]:
# Merge the raw position codes into coarser classes:
#   1, 2, 12  -> 1
#   3, 34, 4  -> 2
#   5         -> 3
#   23, 45    -> 0   (rows with 0 are filtered out by the later `POS > 0` cell)
# NOTE: not idempotent. Re-running this cell remaps the already-merged
# codes (e.g. 2 -> 1, 3 -> 2), so run it exactly once per fresh `df`.
raw_codes = [1, 2, 12, 23, 3, 34, 4, 45, 5]
merged_codes = [1, 1, 1, 0, 2, 2, 2, 0, 3]
df['POS'] = df['POS'].replace(raw_codes, merged_codes)
df.head(2)

Unnamed: 0,GP,GS,MIN,FGM,FGA,FG%,3PM,3PA,3P%,FTM,...,STL,BLK,PF,TOV,PTS,YR,POS,W,H,NAME
0,479,81,22.9,4.08,7.65,0.534,0.01,0.04,0.176,2.72,...,0.43,0.41,2.39,1.22,10.89,8,2,112,206,Carl Landry
1,62,0,7.4,0.73,2.26,0.321,0.27,0.97,0.283,0.29,...,0.32,0.05,0.71,0.39,2.02,2,1,84,193,Gabe Pruitt


In [26]:
# Shape of the frame before filtering on the merged position label.
df.shape

(1429, 26)

In [27]:
# Keep only rows with a positive merged label (drops the rows mapped to 0).
labelled_mask = df['POS'] > 0
df3 = df[labelled_mask]
df3['POS'].unique()

array([2, 1, 3], dtype=int64)

In [28]:
# Row/column count of the three-class subset.
df3.shape

(1269, 26)

# Output

In [29]:
# Export the three-class subset as headerless, index-free TSV.
# `head=False` is not a to_csv parameter: current pandas raises TypeError,
# and older releases silently ignored it, so the row index was written as an
# extra first column. `index=False` matches the other two exports.
# The data3/ directory must already exist; to_csv does not create it.
df3.to_csv('data3/cleaned.txt', sep='\t', index=False, header=False)