# This template is designed to make grading fair and straightforward. Anything that is not placed where the template specifies will not be graded.

<font color='red'><b>NOTE:</b> We will run the notebook through a plagiarism checker. If it is found to be copied, your work will not be graded, and the incident will be reported to the NYU authorities.</font>

# Import Library and Dataset

In [4]:
import pandas as pd
import sys
import numpy as np
from sklearn.preprocessing import StandardScaler
#import csv file from the command line as dataframe
#the path is taken from the first command-line argument (sys.argv[1])
#NOTE(review): inside a Jupyter kernel sys.argv[1] is normally a kernel launch flag rather than a data path - confirm how this notebook is meant to be run
df=pd.read_csv(sys.argv[1])

# PART I: Preprocessing

#### Handling missing values. (If ANY)

In [None]:
#drop weight, finbourgh_flick, double_eight_loop due to missing information and drop player_id due to not related to the target
df.drop(["weight","finbourgh_flick", "double_eight_loop","player_id"], axis=1,inplace=True)

#handling missing values by creating another category named 'U'
#the replaced column is assigned back to the frame: calling replace(..., inplace=True) on
#df[column] only mutates a temporary selection, which pandas warns about and which leaves
#the frame untouched when Copy-on-Write is active
columns_replace=["house","player_code","move_specialty"]
for column in columns_replace:
	df[column]=df[column].replace("?","U")
df["gender"]=df["gender"].replace("Unknown/Invalid","U")

#drop category 'U' from gender
#only very few of rows have unknown type
df = df[df.gender != 'U']


#### Feature Datatype Conversion from Numeric to Categorical and Vice Versa. (If ANY)

In [None]:
#define function for encoding (mapping)
def map_features(features,df,dict):
	"""Apply one value mapping to several columns and return the resulting frame.

	features: iterable of column names to recode.
	df: frame to recode; it is not modified, each step works on the frame returned by replace.
	dict: mapping of old value -> new value, applied to each listed column only.

	Values that are not keys of the mapping, and columns that are not listed, are left as they are.
	"""
	recoded=df
	for column_name in features:
		#restrict the mapping to this one column
		per_column_mapping={column_name:dict}
		recoded=recoded.replace(per_column_mapping)

	return recoded

#collapse the level labels of snitchnip and stooging into three values: none / norm / high
#NOTE(review): pandas.read_csv may parse the literal text "None" as a missing value by default,
#in which case the 'None' key below never matches - confirm against the loaded data
foul_dict={
	'None':'none',
	'Norm':'norm',
	'>7':'high',
	'>8':'high',
	'>200':'high',
	'>300':'high',
}
foul_columns=["snitchnip","stooging"]
df=map_features(foul_columns,df,foul_dict)

#generate move specialty dict for reducing nominal values
#1 stands for with specialty, 0 stands for without specialty
def convert_move_specialty(df):
	"""Build the lookup used to turn move_specialty into a 0/1 flag.

	Every value found in df["move_specialty"] becomes a key of the returned mapping:
	"U" (the unknown category) maps to 0 and any other value maps to 1.
	"""
	return {
		specialty:(0 if specialty=="U" else 1)
		for specialty in df["move_specialty"]
	}

#build the 0/1 lookup for move_specialty and apply it to that column
move_spec_dict=convert_move_specialty(df)
df=map_features(["move_specialty"],df,move_spec_dict)

#tactics features, ready for conversion
#21 names are listed here although this note originally said 23 - verify against the dataset
tactics_columns=[
	"body_blow",
	"checking",
	"dopplebeater_defence",
	"hawkshead_attacking_formation",
	"no_hands_tackle",
	"power_play",
	"sloth_grip_roll",
	"spiral_dive",
	"starfish_and_stick",
	"twirl",
	"wronski_feint",
	"zig-zag",
	"bludger_backbeat",
	"chelmondiston_charge",
	"dionysus_dive",
	"reverse_pass",
	"parkins_pincer",
	"plumpton_pass",
	"porskoff_ploy",
	"transylvanian_tackle",
	"woollongong_shimmy",
]

#keep a copy of the frame taken before the tactics are encoded
#it is used later for feature reduction and extraction
df_tactics_change=df.copy()

#convert tactics to a used / not used flag
#Steady, Up, Down to 1, No to 0
tactics_dict={
	'Steady':1,
	'No':0,
	'Up':1,
	'Down':1,
}
df=map_features(tactics_columns,df,tactics_dict)

#convert the two-valued columns to integer codes
#the position of a label in its list is its code: first label -> 0, second label -> 1
#  gender:                  Female to 0, Male to 1
#  snitch_caught:           No to 0, Yes to 1
#  change:                  No to 0, Ch to 1
#  quidditch_league_player: NO to 0, YES to 1 (target)
#leave the quidditch_league_player entry out when transforming test data
#NOTE(review): a value that is not one of the two labels is expected to receive code -1 - confirm
binary_encodings={
	"gender":["Female","Male"],
	"snitch_caught":["No","Yes"],
	"change":["No","Ch"],
	"quidditch_league_player":["NO","YES"],
}

for binary_column,ordered_satisfaction in binary_encodings.items():
	cat_dtype = pd.api.types.CategoricalDtype(ordered_satisfaction, ordered=True)
	df[binary_column]=df[binary_column].astype(cat_dtype).cat.codes

#one-hot encode the remaining categorical columns
#each listed column is replaced by one indicator column per distinct value

df=pd.get_dummies(
	df,
	columns=[
		"house",
		"foul_type_id",
		"game_move_id",
		"penalty_id",
		"player_code",
		"player_type",
		"snitchnip",
		"stooging",
	],
)



#### Feature Reduction or extraction. (If ANY)

In [None]:
#combine num_games_satout, num_games_injured and num_games_notpartof
#into a single feature named num_game_not_participate (their row-wise sum)

df["num_game_not_participate"]=(
	df["num_games_satout"]
	+df["num_games_injured"]
	+df["num_games_notpartof"]
)

#next feature: num_tactics_change, the number of tactics whose setting changed

#helper encoding for that count
#only Up and Down count as a change; Steady and No count as 0
tactics_change_dict={
	'Steady':0,
	'No':0,
	'Up':1,
	'Down':1,
}

#apply the helper encoding to the copy of the frame, so its tactic columns can be summed
df_tactics_change=map_features(tactics_columns,df_tactics_change,tactics_change_dict)

#the counter column starts at 0 for every row
df["num_tactics_change"]=0

#define function for sum change of tactics
def sum_change_tactics(df,df_copy,columns):
	"""Add the listed columns of df_copy onto df["num_tactics_change"], in place.

	df: frame holding the running total; the num_tactics_change column must already exist.
	df_copy: frame supplying the values to add.
	columns: names of the df_copy columns to add, one after another.

	Returns nothing; df is modified.
	"""
	for tactic_name in columns:
		df["num_tactics_change"]+=df_copy[tactic_name]

sum_change_tactics(df=df,df_copy=df_tactics_change,columns=tactics_columns)


#count how many tactics each player used
#the total goes into a new column named num_total_tactics, which starts at 0

df["num_total_tactics"]=0
def sum_tactics(df,columns):
	"""Add the listed columns of df onto df["num_total_tactics"], in place.

	df: frame to update; the num_total_tactics column must already exist.
	columns: names of the columns to add, one after another.

	Returns the same frame that was passed in.
	"""
	for tactic_name in columns:
		df["num_total_tactics"]+=df[tactic_name]

	return df

sum_tactics(df=df,columns=tactics_columns)

#put the target in the final column position
#skip this step when transforming test data
df_target=df["quidditch_league_player"]
df.drop(columns=["quidditch_league_player"],inplace=True)
#df.shape[1] is the column count after the drop, i.e. the position just past the last column
df.insert(df.shape[1],"quidditch_league_player",df_target)



#### Any other Pre-processing Used. (Give the name along with the code.)

In [2]:
#log transform

log_transform_columns=[
	"num_games_satout",
	"num_games_injured",
	"num_games_notpartof",
]
def log_transform(df,columns):
	"""Replace each listed column of df with log(value + 1), in place.

	df: frame to update.
	columns: names of the columns to transform.

	Returns nothing; df is modified.
	"""
	for column_name in columns:
		#shift by 1 before taking the logarithm, so a value of 0 maps to log(1) = 0
		shifted=df[column_name]+1
		df[column_name]=shifted.apply(np.log)

log_transform(df,log_transform_columns)

#Standardization (v-mean)/std

numeric_columns=[
	"game_duration",
	"num_game_moves",
	"num_game_losses",
	"num_practice_sessions",
	"num_games_satout",
	"num_games_injured",
	"num_games_notpartof",
	"num_games_won",
	"age",
	"num_total_tactics",
	"num_game_not_participate",
	"num_tactics_change",
]
def standardize_numeric_value(df,columns):
	"""Standardize each listed column of df in place with a StandardScaler.

	df: frame to update.
	columns: names of the columns to standardize; the scaler is re-fitted for every column.

	Returns nothing; df is modified.
	"""
	scaler = StandardScaler()
	for column_name in columns:
		#the scaler is given a single-column 2-D array, hence the reshape
		column_matrix=df[column_name].values.reshape(-1,1)
		df[column_name]=scaler.fit_transform(column_matrix)

standardize_numeric_value(df,numeric_columns)

#remove outliers
def remove_outliers(df,columns):
	"""Drop, in place, every row lying more than 3 standard deviations from a column mean.

	df: frame to filter; rows are removed from this very object.
	columns: names of the columns to check, one after another. Mean and standard
		deviation are recomputed for each column on the rows that are still present.

	A row is kept only when |value - mean| <= 3 * std holds, so a row whose value is
	missing in a checked column is removed as well.
	Rows are removed by index label, so the index is assumed to hold unique labels.

	Returns nothing; df is modified.
	"""
	for i in columns:
		within_three_sigma = np.abs(df[i] - df[i].mean()) <= (3 * df[i].std())
		#assigning the filtered frame to the local name `df` would never reach the caller,
		#so the offending rows are dropped from the caller's frame directly
		outlier_labels = df.index[~within_three_sigma.values]
		df.drop(index=outlier_labels, inplace=True)
		
#run the outlier helper over the numeric columns; nothing it returns is captured here
remove_outliers(df,numeric_columns)

#generate correlation matrix to observe
#the matrix is written to correlation.csv
df_corr=df.corr()
df_corr.to_csv("correlation.csv")

#write the processed frame to data_aftercleaned.csv, without the index column
df.to_csv("data_aftercleaned.csv",index=False)



NameError: name 'df' is not defined

# PART II: Classification

### Model 1:
Model Name:-----------<br>
Evaluation method and metric used Name:-----------<br>
Name of the Hyperparameter used:--------------......<br>


In [5]:
#Code...

### Model 2:
Model Name:-----------<br>
Evaluation method and metric used Name:-----------<br>
Name of the Hyperparameter used:--------------......<br>


In [None]:
#Code...

### Model 3:
Model Name:-----------<br>
Evaluation method and metric used Name:-----------<br>
Name of the Hyperparameter used:--------------......<br>


In [None]:
#Code...

# PART III: Best Hypothesis:
Model Name:------------<br>
Reason:--------------<br>
Hyper-parameter Value:-----------<br>
