In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance

import os
# Print the full path of every file found under /kaggle/input.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [None]:
# Read test_scores.csv into `df` and preview the first rows.
csv_path = '/kaggle/input/predict-test-scores-of-students/test_scores.csv'
df = pd.read_csv(csv_path)
df.head()

Duplicates and missing values

In [None]:
# Number of rows currently in `df`.
df.shape[0]

In [None]:
# Keep only the first occurrence of each fully identical row, then show the
# resulting (rows, columns).
df = df.drop_duplicates(keep='first')
df.shape

In [None]:
# Fraction of missing values per column, largest first.
missing_counts = df.isna().sum()
missing_counts.sort_values(ascending=False) / df.shape[0]

Encoding and column drop

In [None]:
# Remove the school, classroom and student_id columns from `df`.
df = df.drop(columns=['school', 'classroom', 'student_id'])

In [None]:
# Show the distinct values of each column that is one-hot encoded afterwards.
for column in ('school_setting', 'school_type', 'teaching_method', 'gender', 'lunch'):
    print(column, str(df[column].unique()))

In [None]:
# Columns that are each replaced in place by a single 0/1 indicator.
binary_columns = ['school_type', 'teaching_method', 'gender', 'lunch']

# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the new keyword first and fall back to the old
# one on releases that do not know it yet.
try:
    ohe_binaries = OneHotEncoder(drop='if_binary', sparse_output=False)
except TypeError:
    ohe_binaries = OneHotEncoder(drop='if_binary', sparse=False)

# drop='if_binary' keeps one output column per two-category feature (the first
# category in sorted order is the one dropped), so the encoded matrix lines up
# one-to-one with `binary_columns`.
# NOTE(review): this assumes every listed column has exactly two categories;
# otherwise the encoder emits extra columns and the assignment below fails.
ohe_binaries.fit(df[binary_columns])
df[binary_columns] = ohe_binaries.transform(df[binary_columns])

In [None]:
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the new keyword first and fall back to the old
# one on releases that do not know it yet.
try:
    ohe_school_setting = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe_school_setting = OneHotEncoder(sparse=False)

ohe_school_setting.fit(df[['school_setting']])
school_setting_encoded = ohe_school_setting.transform(df[['school_setting']])

# The encoder orders its output columns by the sorted category values stored in
# `categories_`, not by order of appearance. Hard-coding the names as
# Urban, Suburban, Rural therefore attached the wrong label to the first and
# last indicator. Name each column after the category it actually encodes.
# NOTE(review): presumably the categories are 'Rural', 'Suburban' and 'Urban';
# verify against the unique values of school_setting.
for setting, indicator in zip(ohe_school_setting.categories_[0], school_setting_encoded.T):
    df[setting] = indicator

In [None]:
# The raw school_setting column is no longer needed once its indicator
# columns exist.
df = df.drop(columns='school_setting')

In [None]:
# Pairwise correlation of all columns, drawn as a heatmap with the column
# names on both axes.
corr = df.corr()
feature_names = corr.columns
sns.heatmap(corr, xticklabels=feature_names, yticklabels=feature_names, cmap="YlGnBu")

In [None]:
# Flatten the correlation matrix into one row per (feature_1, feature_2) pair.
corr_df = corr.unstack().reset_index()
corr_df.columns = ['feature_1', 'feature_2', 'correlation']

# The matrix is symmetric, so (a, b) and (b, a) carry the same value. Keeping
# only rows where feature_1 sorts before feature_2 removes the
# self-correlations and one copy of every mirrored pair, so the top-10 below
# lists ten distinct pairs instead of five pairs shown twice.
# (Assumes the column labels are strings, so they can be compared.)
is_unique_pair = corr_df['feature_1'] < corr_df['feature_2']
corr_df = corr_df[is_unique_pair].sort_values(by="correlation", ascending=False)
corr_df.head(10)

In [None]:
# Most negative correlation among the listed feature pairs.
corr_df['correlation'].min()

Base model

In [None]:
# Baseline: linear regression of posttest on every other column, evaluated
# with cross_val_score's default splitting and scoring; the mean fold score is
# kept as the reference value.
y = df['posttest']
X = df.drop(columns='posttest')

model = LinearRegression()
scores = cross_val_score(model, X, y)

base_model_score = np.mean(scores)
base_model_score

In [None]:
# Fit one model on the full data and measure how much its score drops when
# each feature is shuffled. The importance is evaluated on the same rows the
# model was trained on.
permut_model = LinearRegression().fit(X, y)

# A fixed random_state makes the shuffles, and therefore the ranking,
# reproducible from one run to the next.
permutation_score = permutation_importance(
    permut_model, X, y, n_repeats=10, random_state=0
)

# Build the table from named columns so 'score decrease' keeps a float dtype
# instead of the object dtype produced by stacking names and numbers together.
importance_df = pd.DataFrame({
    'feature': X.columns,
    'score decrease': permutation_score.importances_mean,
})
importance_df.sort_values(by="score decrease", ascending=False)

In [None]:
# Second model: same setup as the baseline but without the school_type and
# gender columns; the mean cross-validation score is kept for comparison.
y = df['posttest']
X = df.drop(columns=['posttest', 'school_type', 'gender'])

model2 = LinearRegression()
scores = cross_val_score(model2, X, y)

model_score = np.mean(scores)
model_score