In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Location of the test-scores CSV (relative path used by the Kaggle kernel).
csv_path = '../input/predict-test-scores-of-students/test_scores.csv'

# Read the table and preview the first rows to confirm it parsed as expected.
df = pd.read_csv(csv_path)
df.head()

Build a model for predicting posttest scores

# Data Exploration

In [None]:
# Print column names, dtypes and non-null counts for the loaded frame.
df.info()

In [None]:
# Drop the student_id column. Reassigning the result (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: a second run of
# the original raised KeyError because the column was already gone.
df = df.drop(columns='student_id', errors='ignore')

In [None]:
# Summary statistics (count, mean, std, quartiles) for the frame's columns.
df.describe()

In [None]:
# Number of rows recorded for each school.
df['school'].value_counts()

In [None]:
# Number of rows in each school setting.
df['school_setting'].value_counts()

In [None]:
# Number of rows for each school type.
df['school_type'].value_counts()

In [None]:
# Row counts for every (school_type, teaching_method) pair as grouped bars.
ax = pd.crosstab(df['school_type'], df['teaching_method']).plot(kind='bar')
ax.set_ylabel('count')
# Explicit show() suppresses the Axes repr and matches the other plotting cells.
plt.show()

Public schools have more students taught with the experimental method than non-public schools.

In [None]:
# Number of rows in each classroom.
df['classroom'].value_counts()

In [None]:
# Drop the classroom column. Reassigning the result (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: a second run of
# the original raised KeyError because the column was already gone.
df = df.drop(columns='classroom', errors='ignore')

In [None]:
# Number of rows for each teaching method.
df['teaching_method'].value_counts()

In [None]:
# Cross-tabulate school setting against teaching method, then chart the counts.
setting_by_method = pd.crosstab(index=df['school_setting'],
                                columns=df['teaching_method'])
setting_by_method.plot.bar()
plt.show()

The experimental teaching method is used much less often than the standard method in every school setting.

In [None]:
# Number of rows for each lunch category.
df['lunch'].value_counts()

In [None]:
# Cross-tabulate school setting against lunch category, then chart the counts.
setting_by_lunch = pd.crosstab(index=df['school_setting'], columns=df['lunch'])
setting_by_lunch.plot.bar(figsize=(10, 7))
plt.show()

In the urban setting, more students receive free lunch than in the other settings.

In [None]:
# Row counts for every (school_setting, gender) pair as grouped bars.
ax = pd.crosstab(df['school_setting'], df['gender']).plot(kind='bar', figsize=(9, 6))
ax.set_ylabel('count')
# Explicit show() suppresses the Axes repr and matches the other plotting cells.
plt.show()

The gender distribution is similar across the different school settings.

# Dimensionality Reduction

In [None]:
# Rows per school, largest first (descending order is value_counts' default).
sch_cnt = df['school'].value_counts()
sch_cnt

In [None]:
# Keep only the schools whose row count is under 100.
is_small = sch_cnt.lt(100)
sch_below_100 = sch_cnt.loc[is_small]
sch_below_100

In [None]:
# Collapse every school listed in sch_below_100 into a single 'other' label.
# The school names are the *index* of sch_below_100 (it comes from
# value_counts), so membership is tested against that index explicitly with a
# vectorised isin() rather than relying on `x in Series` inside a per-row lambda.
is_small_school = df['school'].isin(sch_below_100.index)
df['school'] = df['school'].mask(is_small_school, 'other')
df['school'].value_counts()

Schools with fewer than 100 records have been relabeled as 'other'.

# Encoding Categorical Variables

In [None]:
# One-hot encode the categorical columns; drop_first removes one level per
# feature so the dummy columns are not perfectly collinear.
df_enc = pd.get_dummies(data=df, drop_first=True)
df_enc

# Outlier Detection and Removal

In [None]:
# Box plot of n_student. The series is passed as x= explicitly: a bare
# positional argument is deprecated in seaborn 0.11 and is interpreted as
# `data` from 0.12 on, so the plot would depend on the installed version.
sns.boxplot(x=df['n_student'])
plt.show()

In [None]:
# Box plot of pretest. The series is passed as x= explicitly: a bare
# positional argument is deprecated in seaborn 0.11 and is interpreted as
# `data` from 0.12 on, so the plot would depend on the installed version.
sns.boxplot(x=df['pretest'])
plt.show()

In [None]:
# Box plot of posttest. The series is passed as x= explicitly: a bare
# positional argument is deprecated in seaborn 0.11 and is interpreted as
# `data` from 0.12 on, so the plot would depend on the installed version.
sns.boxplot(x=df['posttest'])
plt.show()

The box plots show no outliers in `n_student`, `pretest`, or `posttest`.

# Splitting Dependent and Independent Variables

In [None]:
# Feature matrix: every encoded column except the target.
x = df_enc.drop(columns='posttest')
x.head()

In [None]:
# Target vector: the posttest column.
y = df_enc.loc[:, 'posttest']
y

# Feature Scaling

In [None]:
# Standardise the features with StandardScaler (zero mean, unit variance).
# NOTE(review): the cross-validation cells visible in this notebook are given
# `x`, not `x_s` -- confirm whether the scaled matrix is meant to be used.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(x)
x_s = sc.transform(x)
x_s

In [None]:
# Scatter the first feature column against the target. The axes are labelled
# from the column names so the figure can be read without checking which
# column sits at position 0.
plt.scatter(x.iloc[:, 0], y)
plt.xlabel(x.columns[0])
plt.ylabel(y.name)
plt.show()

In [None]:
# Scatter the second feature column against the target. The axes are labelled
# from the column names so the figure can be read without checking which
# column sits at position 1.
plt.scatter(x.iloc[:, 1], y)
plt.xlabel(x.columns[1])
plt.ylabel(y.name)
plt.show()

# Model Training and Evaluation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [None]:
# 3-fold cross-validated scores for ordinary least-squares regression,
# followed by their mean.
score1 = cross_val_score(estimator=LinearRegression(), X=x, y=y, cv=3)
score1.mean()

In [None]:
# 3-fold cross-validated scores for support-vector regression.
# SVR is sensitive to feature scale, so the features are standardised inside a
# pipeline: the scaler is re-fit on each training fold only, which also avoids
# leaking validation-fold statistics into the model.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

score2 = cross_val_score(make_pipeline(StandardScaler(), SVR()), x, y, cv=3)
np.average(score2)

In [None]:
# 3-fold cross-validated scores for a random forest.
# random_state is fixed so the bootstrap samples and feature subsets, and
# therefore the reported score, are identical on every run.
score3 = cross_val_score(RandomForestRegressor(random_state=42), x, y, cv=3)
np.average(score3)

In [None]:
# 3-fold cross-validated scores for a single decision tree.
# random_state is fixed so tie-breaking between equally good splits, and
# therefore the reported score, is identical on every run.
score4 = cross_val_score(DecisionTreeRegressor(random_state=42), x, y, cv=3)
np.average(score4)

In [None]:
# Mean cross-validated score of each model. The values are read from the
# score arrays produced by the cross_val_score cells rather than pasted in by
# hand, so the table cannot drift out of sync when the notebook is re-run.
# Note: for regressors cross_val_score reports R^2 by default, not accuracy.
accd={
    'LinearRegression':[np.average(score1)],
    'SVR':[np.average(score2)],
    'RandomForest':[np.average(score3)],
    'DecisionTree':[np.average(score4)]
}

In [None]:
# One-row comparison table: a column per model holding its mean CV score.
acc = pd.DataFrame(accd)
acc