# This Template is created to make grading fair and straightforward. Anything not in the place as mentioned in the template would not be graded.

<font color='red'> # NOTE: We would run the notebook through a Plagiarism Checker. If it is found to be copied, your work would not be graded, and the incident would be highlighted to NYU Authorities. </font>

# Import Library and Dataset

In [1]:
import numpy as np
import pandas as pd
import os 
import sklearn
import matplotlib.pyplot as plt
from sklearn import linear_model, neighbors, preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold, cross_val_score
from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, IterativeImputer, BiScaler
from sklearn.preprocessing import LabelBinarizer
 
#read data
df = pd.read_csv('qudditch_training.csv')
y_label = df['quidditch_league_player']
df = df.drop('quidditch_league_player',axis=1)

Using TensorFlow backend.


# PART I: Preprocessing

#### Feature Datatype Conversion From Numeric to categoric and Vice-versa. (If ANY)
#### Feature Reduction or extraction. (If ANY)

In [2]:
#replace gender with numeric
df.replace('Female',1,inplace=True)
df.replace('Male',0,inplace=True)
df['weight'].replace({'>200': 200, '[0-25)': 0,'[25-50)': 25,'[50-75)': 50,'[75-100)': 75,'[100-125)': 100,'[125-150)': 120,'[150-175)': 150,'[175-200)': 175,'?':np.NaN},inplace=True)
df = df[df.gender != 'Unknown/Invalid']
df = df[df.house != '?']



# Finding columns with no information
useless = []
for i in df.columns:
    l = len(df[i].unique())
    if l<2:
        useless.append(i)
        print("Column Name: ",i,"Uniques: ",l)
df.drop(useless,axis=1,inplace=True)  

Column Name:  double_eight_loop Uniques:  1
Column Name:  finbourgh_flick Uniques:  1
Column Name:  transylvanian_tackle Uniques:  1


In [3]:
#change all feature that only has two items into 0,1 which all no indicated by 0 and other indicated by 1 
bi = []
for i in df.columns.drop('gender'):
    l = len(df[i].unique())
    if l == 2:
        bi.append(i)
        print("Column Name: ",i,"Uniques: ",l)
        print(df[i].unique())
        
for i in bi:
    df[i].replace(df[i].unique()[0],0,inplace=True)
    df[i].replace(df[i].unique()[1],1,inplace=True)        

Column Name:  power_play Uniques:  2
['No' 'Steady']
Column Name:  starfish_and_stick Uniques:  2
['No' 'Steady']
Column Name:  chelmondiston_charge Uniques:  2
['No' 'Steady']
Column Name:  plumpton_pass Uniques:  2
['No' 'Steady']
Column Name:  porskoff_ploy Uniques:  2
['No' 'Steady']
Column Name:  woollongong_shimmy Uniques:  2
['No' 'Steady']
Column Name:  change Uniques:  2
['No' 'Ch']
Column Name:  snitch_caught Uniques:  2
['No' 'Yes']


In [4]:
# makesure all columns have more than 2 value
useless = []
for i in df.columns:
    l = len(df[i].unique())
    if l<2:
        useless.append(i)
        print("Column Name: ",i,"Uniques: ",l)
df.drop(useless,axis=1,inplace=True)  

In [5]:
#dummie part
cat = []
for i in df.columns:
    l = len(df[i].unique())
    if l > 2:
        cat.append(i)
        print("Column Name: ",i,"Uniques: ",l)
#         print(df[i].unique())
cat.remove('player_id')
cat.remove('weight')
cat.remove('player_code')
cat.remove('move_specialty')

dummie = pd.get_dummies(df[cat])
dummie = pd.concat([dummie,df[bi]],axis=1)
dummie = pd.concat([dummie,df['gender']],axis=1)


Column Name:  id_num Uniques:  99002
Column Name:  player_id Uniques:  69405
Column Name:  house Uniques:  5
Column Name:  age Uniques:  10
Column Name:  weight Uniques:  10
Column Name:  foul_type_id Uniques:  8
Column Name:  game_move_id Uniques:  26
Column Name:  penalty_id Uniques:  17
Column Name:  game_duration Uniques:  14
Column Name:  player_code Uniques:  17
Column Name:  move_specialty Uniques:  73
Column Name:  num_game_moves Uniques:  118
Column Name:  num_game_losses Uniques:  7
Column Name:  num_practice_sessions Uniques:  75
Column Name:  num_games_satout Uniques:  39
Column Name:  num_games_injured Uniques:  32
Column Name:  num_games_notpartof Uniques:  21
Column Name:  player_type Uniques:  9
Column Name:  num_games_won Uniques:  16
Column Name:  snitchnip Uniques:  4
Column Name:  stooging Uniques:  4
Column Name:  body_blow Uniques:  4
Column Name:  checking Uniques:  4
Column Name:  dopplebeater_defence Uniques:  4
Column Name:  hawkshead_attacking_formation Uniqu

In [6]:
dummie.columns

Index(['id_num', 'age', 'foul_type_id', 'game_move_id', 'penalty_id',
       'game_duration', 'num_game_moves', 'num_game_losses',
       'num_practice_sessions', 'num_games_satout', 'num_games_injured',
       'num_games_notpartof', 'num_games_won', 'house_Gryffindor',
       'house_Hufflepuff', 'house_Other', 'house_Ravenclaw', 'house_Slytherin',
       'player_type_Beater1', 'player_type_Beater2', 'player_type_Captain',
       'player_type_Chaser1', 'player_type_Chaser2', 'player_type_Chaser3',
       'player_type_Keeper', 'player_type_Multiple', 'player_type_Seeker',
       'snitchnip_>200', 'snitchnip_>300', 'snitchnip_None', 'snitchnip_Norm',
       'stooging_>7', 'stooging_>8', 'stooging_None', 'stooging_Norm',
       'body_blow_Down', 'body_blow_No', 'body_blow_Steady', 'body_blow_Up',
       'checking_Down', 'checking_No', 'checking_Steady', 'checking_Up',
       'dopplebeater_defence_Down', 'dopplebeater_defence_No',
       'dopplebeater_defence_Steady', 'dopplebeater_defence

#### Handling missing values. (If ANY)

In [7]:
#get single colunm
wei = df['weight']
player = df['player_code']
spc = df['move_specialty']

temp_weight = pd.concat([dummie,wei],axis=1)
temp_play = pd.concat([dummie,player],axis=1)
temp_play.replace('?',np.NaN,inplace=True)
temp_spc = pd.concat([dummie,spc],axis=1)
temp_spc.replace('?',np.NaN,inplace=True)

In [8]:
# split data for weight
weight_null = temp_weight.loc[temp_weight['weight'].isnull()]
weight_no_null = temp_weight.dropna()
wei_train_x = weight_no_null.iloc[:,:-1]
wei_train_y = weight_no_null.iloc[:,-1:]
wei_test_x = weight_null.iloc[:,:-1]

In [9]:
# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(wei_train_x, wei_train_y)
# Make predictions using the testing set
wei_test_y = regr.predict(wei_test_x)

  linalg.lstsq(X, y)


In [10]:
#concat the result to original dataset 
df_test_y = pd.DataFrame(wei_test_y)
df_test_y.reset_index(drop=True, inplace=True)
wei_test_x.reset_index(drop=True, inplace=True)
result = pd.concat([wei_test_x,df_test_y],axis=1)
result.rename(index=str, columns={0: "weight"},inplace=True)
result=result.append(weight_no_null)

In [11]:
#add player_code colunm
id_ply = df[['id_num','player_code']]
result = result.join(id_ply.set_index('id_num'),on='id_num')
result.sort_values(by='id_num')
result.set_index('id_num',inplace=True)

In [12]:
#split set by player_code
result['player_code'].replace('?',np.NaN,inplace=True)
code_null = result.loc[result['player_code'].isnull()]
code_no_null = result.dropna()
code_train_x = code_no_null.iloc[:,:-1]
code_train_y = code_no_null.iloc[:,-1:]
code_test_x = code_null.iloc[:,:-1]

In [13]:
#LabelEncoder to mover_specialty
le = preprocessing.LabelEncoder()
le.fit(code_train_y['player_code'])
trans=le.transform(code_train_y['player_code']) 

In [14]:
#knn prediction
n_neighbors = 15
ply_knn = neighbors.KNeighborsClassifier(n_neighbors, weights="distance")
ply_knn.fit(code_train_x, trans)
ply_pre_knn=ply_knn.predict(code_test_x)

In [15]:
#add predict row to original one
pre=np.around(ply_pre_knn)
pre = pre.astype('int64')
pre_y=le.inverse_transform(pre)
code_test_x['player_code'] = pre_y
result=code_no_null.append(code_test_x)
result = result.sort_values(by='id_num')

In [16]:
#data split with move_specialty 
id_spc = df['move_specialty']
result = result.join(id_spc,on='id_num')
result['move_specialty'].replace('?',np.NaN,inplace=True)
spc_null = result.loc[result['move_specialty'].isnull()]
spc_no_null = result.dropna()
spc_train_x = spc_no_null.iloc[:,:-2]
spc_train_y = spc_no_null.iloc[:,-1:]
spc_test_x = spc_null.iloc[:,:-2]


In [17]:
#LabelEncoder to mover_specialty
le1 = preprocessing.LabelEncoder()
le1.fit(spc_train_y['move_specialty'])
trans_spc=le1.transform(spc_train_y['move_specialty'])


In [18]:
#knn prediction
n_neighbors = 15
clf_knn = neighbors.KNeighborsClassifier(n_neighbors, weights="distance")
clf_knn.fit(spc_train_x, trans_spc)
pre_knn=clf_knn.predict(spc_test_x)
pre_knn_in=le1.inverse_transform(pre_knn)


In [25]:
#add predict row to original one
spc_test_x['move_specialty'] = pre_knn_in
spc_test_x['player_code'] = spc_null['player_code']
result=spc_no_null.append(spc_test_x,sort=True)
result.sort_values(by='id_num')


Unnamed: 0_level_0,age,bludger_backbeat_Down,bludger_backbeat_No,bludger_backbeat_Steady,bludger_backbeat_Up,body_blow_Down,body_blow_No,body_blow_Steady,body_blow_Up,change,...,weight,woollongong_shimmy,wronski_feint_Down,wronski_feint_No,wronski_feint_Steady,wronski_feint_Up,zig-zag_Down,zig-zag_No,zig-zag_Steady,zig-zag_Up
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,11.0,0,1,0,0,0,1,0,0,0,...,64.542387,0,0,1,0,0,0,1,0,0
2,12.0,0,1,0,0,0,1,0,0,1,...,92.431557,0,0,1,0,0,0,1,0,0
3,13.0,0,1,0,0,0,1,0,0,0,...,80.993659,0,0,1,0,0,0,1,0,0
4,14.0,0,1,0,0,0,1,0,0,1,...,86.503747,0,0,1,0,0,0,1,0,0
5,14.5,0,1,0,0,0,1,0,0,1,...,74.277106,0,0,1,0,0,0,1,0,0
6,15.0,0,1,0,0,0,1,0,0,0,...,82.395936,0,0,1,0,0,0,1,0,0
7,15.5,0,1,0,0,0,0,1,0,1,...,83.391879,0,0,1,0,0,0,1,0,0
8,16.0,0,1,0,0,0,1,0,0,0,...,70.397566,0,0,1,0,0,0,1,0,0
9,16.5,0,1,0,0,0,1,0,0,1,...,59.271119,0,0,1,0,0,0,1,0,0
10,17.0,0,1,0,0,0,1,0,0,1,...,60.354800,0,0,0,1,0,0,1,0,0


#### Any other Pre-processing Used. (Give the name along with the code.)

In [20]:
#dummie again
ply_and_spc = ['player_code','move_specialty']
dummie_ag = pd.get_dummies(result[ply_and_spc])
result = pd.concat([result,dummie_ag],axis=1)
result=result.sort_index()

In [21]:
#the final dataset is a 99002 rows × 190 columns and called result
redun = ['player_code','move_specialty']
result.drop(redun,axis=1)

Unnamed: 0_level_0,age,bludger_backbeat_Down,bludger_backbeat_No,bludger_backbeat_Steady,bludger_backbeat_Up,body_blow_Down,body_blow_No,body_blow_Steady,body_blow_Up,change,...,move_specialty_specialty66,move_specialty_specialty67,move_specialty_specialty68,move_specialty_specialty69,move_specialty_specialty7,move_specialty_specialty70,move_specialty_specialty71,move_specialty_specialty72,move_specialty_specialty8,move_specialty_specialty9
id_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,11.0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12.0,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,13.0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,14.0,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,14.5,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,15.0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,15.5,0,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
8,16.0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,16.5,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
10,17.0,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


0          NO
1          NO
2          NO
3          NO
4          NO
5          NO
6          NO
7          NO
8          NO
9          NO
10         NO
11        YES
12        YES
13         NO
14         NO
15         NO
16        YES
17         NO
18         NO
19         NO
20         NO
21         NO
22         NO
23         NO
24         NO
25         NO
26         NO
27         NO
28         NO
29         NO
         ... 
101236     NO
101237     NO
101238     NO
101239     NO
101240     NO
101241     NO
101242     NO
101243     NO
101244     NO
101245     NO
101246    YES
101247     NO
101248     NO
101249     NO
101250    YES
101251     NO
101252     NO
101253     NO
101254     NO
101255     NO
101256     NO
101257     NO
101258     NO
101259     NO
101260     NO
101261     NO
101262     NO
101263     NO
101264     NO
101265     NO
Name: quidditch_league_player, Length: 101266, dtype: object

# PART II: Classification

### Model 1:
Model Name:-----------<br>
Evaluation method and metric used Name:-----------<br>
Name of the Hyperparameter used:--------------......<br>


In [22]:
#Code...

### Model 2:
Model Name:-----------<br>
Evaluation method and metric used Name:-----------<br>
Name of the Hyperparameter used:--------------......<br>


In [23]:
#Code...

### Model 3:
Model Name:-----------<br>
Evaluation method and metric used Name:-----------<br>
Name of the Hyperparameter used:--------------......<br>


In [24]:
#Code...

# PART III: Best Hypothesis:
Model Name:------------<br>
Reason:--------------<br>
Hyper-parameter Value:-----------<br>
