In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
import pandas as pd
# Load the dataset from a CSV file; the relative path is resolved against the
# current working directory.
df = pd.read_csv("Expanded_data_with_more_features.csv")

In [None]:
# Call info() so the column/dtype/non-null summary is printed; the bare
# attribute only displays the bound-method repr.
df.info()

In [None]:
# Summary statistics for the columns pandas describes by default.
df.describe()

In [None]:
# Data type of every column.
df.dtypes

# Data Cleaning

In [None]:
# Count missing values per column.
df.isna().sum()

## Drop Columns

In [None]:
# Discard the "Unnamed: 0" column.
df = df.drop(columns=["Unnamed: 0"])

## Drop NaNs

In [None]:
# Rows with a missing value in any of these columns are removed.
cols_to_drop = [
    "EthnicGroup",
    "ParentEduc",
    "TestPrep",
    "ParentMaritalStatus",
    "PracticeSport",
    "IsFirstChild",
    "NrSiblings",
    "TransportMeans",
    "WklyStudyHours",
]
df = df.dropna(subset=cols_to_drop)

In [None]:
# Re-count missing values per column after the rows above were dropped.
df.isna().sum()

In [None]:
# Preview the first rows of the cleaned frame.
df.head()

# Type of data

Data - Categorical - Ordinal = "ParentEduc", "TestPrep", "PracticeSport"
Data - Categorical - Nominal = "Gender", "EthnicGroup", "LunchType", "ParentMaritalStatus", "PracticeSport", "IsFirstChild", "TransportMeans"
Data - Numerical - Discrete = "WklyStudyHours", "NrSiblings"
Data - Numerical - Continuous = "MathScore", "ReadingScore", "WritingScore"

## Transformation of Categorical data 

### Ordinal data - Label encoding
<p>"ParentEduc"</p> 
<p>"TestPrep"</p>
<p>"PracticeSport"</p>

In [None]:
from sklearn.preprocessing import LabelEncoder
# Single LabelEncoder instance created for the column-encoding cells.
le = LabelEncoder()

#### ParentEduc

In [None]:
# List the distinct values present in this column before encoding it.
df["ParentEduc"].unique()

In [None]:
# Encode ParentEduc as integer ranks that follow the education ladder.
# LabelEncoder assigns codes in sorted (alphabetical) class order rather than
# the order passed to fit(), so it cannot preserve this ranking; an explicit
# mapping is used instead.
parent_educ_order = [
    "some high school",
    "high school",
    "some college",
    "associate's degree",
    "bachelor's degree",
    "master's degree",
]
parent_educ_codes = {level: rank for rank, level in enumerate(parent_educ_order)}

# Fail loudly on values outside the known levels instead of silently producing NaN.
unexpected = set(df["ParentEduc"].unique()) - set(parent_educ_codes)
if unexpected:
    raise ValueError(f"Unexpected ParentEduc values: {sorted(unexpected, key=str)}")

df["ParentEduc"] = df["ParentEduc"].map(parent_educ_codes).astype(int)

#### TestPrep

In [None]:
# List the distinct values present in this column before encoding it.
df["TestPrep"].unique()

In [None]:
# Encode TestPrep so that 0 means no preparation and 1 means completed.
# LabelEncoder sorts its classes, which would assign 'completed' -> 0 and
# 'none' -> 1, the reverse of the intended order.
test_prep_codes = {"none": 0, "completed": 1}

# Fail loudly on values outside the known levels instead of silently producing NaN.
unexpected = set(df["TestPrep"].unique()) - set(test_prep_codes)
if unexpected:
    raise ValueError(f"Unexpected TestPrep values: {sorted(unexpected, key=str)}")

df["TestPrep"] = df["TestPrep"].map(test_prep_codes).astype(int)

#### PracticeSport

In [None]:
# List the distinct values present in this column before encoding it.
df["PracticeSport"].unique()

In [None]:
# Encode PracticeSport by increasing frequency: never < sometimes < regularly.
# LabelEncoder sorts its classes alphabetically (never, regularly, sometimes),
# which would break that order. "never" still maps to 0.
practice_sport_codes = {"never": 0, "sometimes": 1, "regularly": 2}

# Fail loudly on values outside the known levels instead of silently producing NaN.
unexpected = set(df["PracticeSport"].unique()) - set(practice_sport_codes)
if unexpected:
    raise ValueError(f"Unexpected PracticeSport values: {sorted(unexpected, key=str)}")

df["PracticeSport"] = df["PracticeSport"].map(practice_sport_codes).astype(int)

#### WklyStudyHours

In [None]:
# List the distinct values present in this column before encoding it.
df["WklyStudyHours"].unique()

In [None]:
# Encode WklyStudyHours by increasing study time: '< 5' < '5 - 10' < '> 10'.
# LabelEncoder sorts the labels as strings ('5 - 10', '< 5', '> 10'), which
# would put the middle bucket first.
study_hours_codes = {"< 5": 0, "5 - 10": 1, "> 10": 2}

# Fail loudly on values outside the known levels instead of silently producing NaN.
unexpected = set(df["WklyStudyHours"].unique()) - set(study_hours_codes)
if unexpected:
    raise ValueError(f"Unexpected WklyStudyHours values: {sorted(unexpected, key=str)}")

df["WklyStudyHours"] = df["WklyStudyHours"].map(study_hours_codes).astype(int)

### Nominal - Getdummies

<p>"Gender"</p>
<p>"EthnicGroup" </p>
<p>"LunchType"</p>
<p>"ParentMaritalStatus"</p>
<p>"PracticeSport"</p> 
<p>"IsFirstChild"</p>
<p>"TransportMeans"</p>

In [None]:
# One indicator column is created per category of each column listed here.
nominal_columns = [
    "Gender",
    "EthnicGroup",
    "LunchType",
    "ParentMaritalStatus",
    "PracticeSport",
    "IsFirstChild",
    "TransportMeans",
]
df = pd.get_dummies(df, columns=nominal_columns)

### Correlation map (heatmap) without removing categories

In [None]:
# Pairwise correlations between all columns of the encoded frame.
corr = df.corr()

# Hide the diagonal and the upper triangle: the matrix is symmetric, so that
# half repeats the lower one.
mask = np.triu(np.ones(corr.shape, dtype=bool))

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(16, 14))
    ax = sns.heatmap(
        corr,
        mask=mask,
        cmap='coolwarm',
        vmin=-1,
        vmax=1,
        annot=True,
        square=True,
        ax=ax,
    )

### Define base categories 

In [None]:
# One indicator per encoded variable is removed so that it acts as the
# reference (base) category.
base_categories = [
    'Gender_male',
    'EthnicGroup_group A',
    'LunchType_standard',
    'ParentMaritalStatus_married',
    'PracticeSport_0',
    'IsFirstChild_yes',
    'TransportMeans_school_bus',
]
df = df.drop(columns=base_categories)

### Delete features with high correlation

In [None]:
# Remove WritingScore from the frame.
df = df.drop(columns=['WritingScore'])

In [None]:
# Recompute the correlations on the frame as it stands at this point.
corr = df.corr()

# Boolean mask covering the diagonal and everything above it; those cells
# duplicate the lower triangle of a symmetric matrix.
mask = np.triu(np.ones(corr.shape, dtype=bool))

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(16, 14))
    heatmap_options = dict(cmap='coolwarm', vmin=-1, vmax=1, annot=True, square=True)
    ax = sns.heatmap(corr, mask=mask, ax=ax, **heatmap_options)

# KNN Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
# Let pandas display up to 200 rows before truncating output.
pd.set_option('display.max_rows', 200)

In [None]:
# Hold-out configuration.
RAND_STATE = 34  # seed so the shuffle is reproducible
TT_RATIO = 0.3   # fraction of rows assigned to the test split

# Features are every column except the target; the target is MathScore.
# Both stay pandas objects (a DataFrame and a Series).
dfX = df.drop(columns=['MathScore'])
X = dfX
y = df['MathScore']

# Split features and target into training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TT_RATIO,
    random_state=RAND_STATE,
)


## Model Development

### Optimal K

In [None]:
# Choose K by 5-fold cross-validation on the training split only. Scoring each
# candidate on X_test would tune the model to the hold-out data and make the
# test metrics reported later optimistically biased.
col1 = list(range(1, 21))  # candidate numbers of neighbours

k_search = GridSearchCV(
    KNeighborsRegressor(),
    param_grid={'n_neighbors': col1},
    scoring='r2',
    cv=5,
)
k_search.fit(X_train, y_train)

# Mean cross-validated R^2 per candidate; for this single-parameter grid the
# results follow the order of col1.
col2 = list(k_search.cv_results_['mean_test_score'])

In [None]:
# Tabulate each candidate K next to the score recorded for it.
KR = pd.DataFrame({'K':col1, 'R':col2})
KR

In [None]:
import pandas as pd
# idxmax gives the index label of the highest score; use it to look up the
# matching K.
max_index = KR['R'].idxmax()
key_max_number = KR.loc[max_index, 'K']
print('The optimal K is', key_max_number)

In [None]:
# Build the final regressor with the selected number of neighbours and fit it
# on the training split.
model = KNeighborsRegressor(n_neighbors=key_max_number)
model.fit(X_train, y_train)

###  R-squared

In [None]:
# Score of the fitted model on the test split (R-squared for a regressor).
model.score(X_test, y_test)

## Model Validation

In [None]:
# Predict on both splits; the training predictions are needed to compare
# train and test errors further down.
y_pred = model.predict(X_test)
y_pred_train= model.predict(X_train)
# Mean squared error on the test split.
print(mean_squared_error(y_test,y_pred))

In [None]:
# Predicted values on the x-axis against actual test values on the y-axis.
fig, ax = plt.subplots()
ax.scatter(y_pred, y_test, color='b')
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def _mean_abs_pct_error(y_true, y_pred):
    """Return the mean absolute percentage error of y_pred against y_true.

    Rows whose true value is 0 are skipped because their percentage error is
    undefined (division by zero); NaN is returned when no row is usable.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    usable = actual != 0
    if not usable.any():
        return np.nan
    return np.mean(np.abs(actual[usable] - predicted[usable]) / np.abs(actual[usable]) * 100.)


def performance_model(y_train, y_test, y_pred_train, y_pred_test):
    """Summarise regression errors on the train and test splits.

    Parameters
    ----------
    y_train, y_test : array-like
        True target values of each split.
    y_pred_train, y_pred_test : array-like
        Predictions for the corresponding split, in the same row order.

    Returns
    -------
    performance : pandas.DataFrame
        One row per metric (mean error, MAE, MSE, RMSE, MAPE, R2) with
        'Train' and 'Test' columns.
    df_train, df_test : pandas.DataFrame
        Real and predicted values side by side for each split.

    Notes
    -----
    Sets ``pd.options.display.float_format`` to two decimals as a side effect,
    which affects every DataFrame displayed afterwards.
    """
    # Signed mean error (bias): positive when the model under-predicts on average.
    ME_train = np.mean(y_train-y_pred_train)
    ME_test  = np.mean(y_test-y_pred_test)

    MAE_train = mean_absolute_error(y_train,y_pred_train)
    MAE_test  = mean_absolute_error(y_test,y_pred_test)

    MSE_train = mean_squared_error(y_train,y_pred_train)
    MSE_test  = mean_squared_error(y_test,y_pred_test)

    RMSE_train = np.sqrt(MSE_train)
    RMSE_test  = np.sqrt(MSE_test)

    # A true value of 0 would make the plain ratio infinite, so the helper
    # leaves those rows out of the average.
    MAPE_train = _mean_abs_pct_error(y_train, y_pred_train)
    MAPE_test  = _mean_abs_pct_error(y_test, y_pred_test)

    R2_train = r2_score(y_train,y_pred_train)
    R2_test  = r2_score(y_test,y_pred_test)

    performance = pd.DataFrame({'Error_metric': ['Mean error','Mean absolute error','Mean squared error',
                                             'Root mean squared error','Mean absolute percentual error',
                                             'R2'],
                            'Train': [ME_train, MAE_train, MSE_train, RMSE_train, MAPE_train, R2_train],
                            'Test' : [ME_test, MAE_test , MSE_test, RMSE_test, MAPE_test, R2_test]})

    # Global display setting kept for backward compatibility with the notebook output.
    pd.options.display.float_format = '{:.2f}'.format

    df_train = pd.DataFrame({'Real_value': y_train, 'Predicted_value': y_pred_train})
    df_test  = pd.DataFrame({'Real_value': y_test,  'Predicted_value': y_pred_test})

    return performance, df_train, df_test

In [None]:
# Only the metrics table is kept; the per-row train/test frames are discarded.
performance, _ ,_ = performance_model(y_train, y_test, y_pred_train, y_pred)
performance

In [None]:
# KNeighborsRegressor exposes no feature_importances_ attribute, so reading it
# raises AttributeError. Permutation importance is model-agnostic: it measures
# how much the test score drops when one feature's values are shuffled.
from sklearn.inspection import permutation_importance

feature_names = list(X.columns)
perm_result = permutation_importance(
    model,
    X_test,
    y_test,
    n_repeats=5,
    random_state=RAND_STATE,
)
importances = perm_result.importances_mean  # mean score drop per feature
features_importances = pd.Series(importances, index=feature_names)
features_importances