# Predict Test Scores of Students

## Task :
- Build an efficient ML model that predicts posttest scores with minimal error and high accuracy.

In [None]:
import pandas as pd 
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px
%matplotlib inline 

In [None]:
# Global plot styling: seaborn dark grid, then matplotlib defaults for font
# size, figure size and a fully transparent figure background.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',
})

## Exploratory Analysis and Visualization

In [None]:
# Load the test-scores CSV into a DataFrame (relative path under ../input).
scores_df=pd.read_csv('../input/predict-test-scores-of-students/test_scores.csv')

In [None]:
scores_df

In [None]:
scores_df.info()

In [None]:
scores_df.describe()

## Pretest Score

In [None]:
# Histogram of pretest scores, coloured by teaching method, with a box plot
# in the margin; bars are drawn with no gap between them.
fig = px.histogram(
    scores_df,
    x='pretest',
    color='teaching_method',
    marginal='box',
    title='Distribution of Pretest score',
).update_layout(bargap=0.)
fig.show()

We can make the following observation:
- Students taught with the experimental method scored higher on the pretest than students taught with the standard teaching method.

## Preprocessing the categorical data

In [None]:
scores_df

In [None]:
# List the distinct values in the `lunch` column.
scores_df.lunch.unique()

In [None]:
# Binary encoding for the `lunch` column, stored next to it as `lunch_code`.
lunch_type = {
    'Does not qualify': 0,
    'Qualifies for reduced/free lunch': 1,
}
scores_df['lunch_code'] = scores_df['lunch'].map(lunch_type)

In [None]:
scores_df

In [None]:
# Binary encoding for `teaching_method`: Standard -> 0, Experimental -> 1.
teaching_type = {
    'Standard': 0,
    'Experimental': 1,
}
scores_df['teaching_code'] = scores_df['teaching_method'].map(teaching_type)

In [None]:
scores_df

In [None]:
# Binary encoding for `gender`: Female -> 0, Male -> 1.
gender_type = {
    'Female': 0,
    'Male': 1,
}
scores_df['gender_code'] = scores_df['gender'].map(gender_type)

In [None]:
scores_df

In [None]:
# List the distinct values in the `school_type` column.
scores_df.school_type.unique()

In [None]:
# Binary encoding for `school_type`: Non-public -> 0, Public -> 1.
school_categories = {
    'Non-public': 0,
    'Public': 1,
}
scores_df['school_type_code'] = scores_df['school_type'].map(school_categories)

In [None]:
# List the distinct values in the `school_setting` column.
scores_df.school_setting.unique()

### One-hot encoding the school setting

In [None]:
from sklearn import preprocessing

In [None]:
# Fit a one-hot encoder on the school_setting column (fit returns the
# encoder itself), then show the categories it learned.
setting_frame = scores_df[['school_setting']]
enc = preprocessing.OneHotEncoder().fit(setting_frame)
enc.categories_

In [None]:
# One-hot encode school_setting and attach one indicator column per category.
one_hot = enc.transform(scores_df[['school_setting']]).toarray()

# Take the column labels from the fitted encoder rather than hard-coding
# them: the array's columns follow enc.categories_[0], so the labels cannot
# end up in a different order (or a different count) than the data.
setting_cols = list(enc.categories_[0])
scores_df[setting_cols] = one_hot

In [None]:
scores_df

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Create the scaler; it is fitted further down on the `teaching_code` column.
scaler=StandardScaler()



In [None]:
# Pairwise correlations between the numeric columns. scores_df still holds
# the original string columns, and DataFrame.corr() raises on those in
# pandas >= 2.0, so restrict the frame to numeric dtypes first.
scores_df.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of pairwise correlations. Only numeric columns are
# passed to corr(): the frame still contains string columns, which make
# DataFrame.corr() raise in pandas >= 2.0.
numeric_df = scores_df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), cmap='Reds', annot=True)
plt.title('Correaltion Matrix:')


In [None]:
# Scatter of posttest against pretest; small, semi-transparent markers.
# The title is applied first so the scatter draws onto the titled axes.
plt.title('Pretest vs. Posttest')
sns.scatterplot(
    x='pretest',
    y='posttest',
    data=scores_df,
    s=15,
    alpha=0.7,
);

In [None]:
# Scatter of pretest vs. posttest with a fitted regression line overlaid.
sns.regplot(data=scores_df,x='pretest',y='posttest')

In [None]:
# Regression target: the posttest column.
targets=scores_df['posttest']

In [None]:
targets

### Loss/Cost function

In [None]:
import numpy as np

In [None]:
def rmse(targets,prediction):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    targets : array-like or scalar
        Observed values.
    prediction : array-like or scalar
        Predicted values, broadcastable against ``targets``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((targets - prediction) ** 2))``.
    """
    # Use the argument, not a same-named global: the previous body read
    # `predictions`, which only worked when the caller happened to have a
    # module-level variable of that name holding the same data.
    return np.sqrt(np.mean(np.square(targets - prediction)))


## Creating the model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Feature matrix for the first model: pretest, n_student, the teaching and
# school-type codes, and the one-hot school-setting columns.
feature_cols = [
    'pretest', 'n_student', 'teaching_code', 'school_type_code',
    'Rural', 'Suburban', 'Urban',
]
inputs = scores_df[feature_cols]

# Fit a linear regression (fit returns the estimator itself).
model = LinearRegression().fit(inputs, targets)

# Evaluate on the same rows the model was fitted on.
predictions = model.predict(inputs)
loss = rmse(targets, predictions)
print('Loss :', loss)


In [None]:
model.coef_

In [None]:
inputs.columns

In [None]:
# Table of fitted coefficients; the intercept is appended under the label 1.
feature_labels = np.append(inputs.columns, 1)
weight_values = np.append(model.coef_, model.intercept_)
weights_df = pd.DataFrame({'feature': feature_labels, 'weight': weight_values})
weights_df.sort_values('weight', ascending=False)

In [None]:
# Model fitted only on rows taught with the standard method (teaching_code 0).
is_standard = scores_df['teaching_code'] == 0
standard_df = scores_df[is_standard]

# teaching_code is constant within this subset, so it is left out.
inputs = standard_df[
    ['pretest', 'n_student', 'school_type_code', 'Rural', 'Suburban', 'Urban']
]
targets = standard_df['posttest']

model = LinearRegression().fit(inputs, targets)

# Error measured on the rows used for fitting.
predictions = model.predict(inputs)
loss = rmse(targets, predictions)
print('Loss :', loss)

In [None]:
# Coefficient table for the standard-method model; intercept labelled 1.
weights_df = pd.DataFrame(
    {
        'feature': np.append(inputs.columns, 1),
        'weight': np.append(model.coef_, model.intercept_),
    }
)
weights_df.sort_values(by='weight', ascending=False)

In [None]:
# Model fitted only on rows taught with the experimental method
# (teaching_code 1).
is_experimental = scores_df['teaching_code'] == 1
experiment_df = scores_df[is_experimental]

# teaching_code is constant within this subset, so it is left out.
inputs = experiment_df[
    ['pretest', 'n_student', 'school_type_code', 'Rural', 'Suburban', 'Urban']
]
targets = experiment_df['posttest']

model = LinearRegression().fit(inputs, targets)

# Error measured on the rows used for fitting.
predictions = model.predict(inputs)
loss = rmse(targets, predictions)
print('Loss :', loss)

In [None]:
# Coefficient table for the experimental-method model; intercept labelled 1.
labels_with_intercept = np.append(inputs.columns, 1)
coefs_with_intercept = np.append(model.coef_, model.intercept_)
weights_df = pd.DataFrame(
    {'feature': labels_with_intercept, 'weight': coefs_with_intercept}
)
weights_df.sort_values('weight', ascending=False)

In [None]:
# Count of rows per teaching method, with bars coloured by school type.
px.histogram(scores_df,x='teaching_method',color='school_type')

In [None]:
# Standardize teaching_code with the existing scaler and store the result
# as a new `scaled_teaching` column (fit returns the scaler itself).
teaching_frame = scores_df[['teaching_code']]
scaled_teaching_type_inputs = scaler.fit(teaching_frame).transform(teaching_frame)
scores_df['scaled_teaching'] = scaled_teaching_type_inputs

In [None]:
scores_df

In [None]:
# Columns to standardize: the one-hot school-setting flags and n_student.
numeric_cols=['Urban','Suburban','Rural','n_student']
# Replace the earlier scaler with a fresh one fitted on these columns only.
scaler=StandardScaler()
scaler.fit(scores_df[numeric_cols])

In [None]:
# Apply the fitted scaler to the same columns, then echo the result.
scaled_inputs=scaler.transform(scores_df[numeric_cols])
scaled_inputs

In [None]:
# Columns passed to the model without going through the second scaler.
other_cols = ['scaled_teaching', 'pretest', 'school_type_code']
other_data = scores_df.loc[:, other_cols].values

In [None]:
# Assemble the design matrix: standardized columns first, then the rest,
# i.e. the same column order as numeric_cols + other_cols.
inputs = np.concatenate([scaled_inputs, other_data], axis=1)
targets = scores_df['posttest']

model = LinearRegression()
model.fit(inputs, targets)

# Both metrics below are computed on the rows the model was fitted on.
predictions = model.predict(inputs)
loss = rmse(targets, predictions)
print('Loss :', loss)

# model.score reports R^2 for a regressor; it is printed as a percentage.
r2_score = model.score(inputs, targets)
print("Accuracy of the model is :", r2_score * 100)

In [None]:
# Coefficient table for the final model. Labels follow the column order used
# to build the inputs (numeric_cols then other_cols); intercept labelled 1.
column_labels = np.append(numeric_cols + other_cols, 1)
fitted_weights = np.append(model.coef_, model.intercept_)
weights_df = pd.DataFrame({'feature': column_labels, 'weight': fitted_weights})
weights_df.sort_values('weight', ascending=False)
