In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
input_root = '/kaggle/input'
for folder, _subfolders, file_names in os.walk(input_root):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

%matplotlib inline

In [None]:
# Load the student test-score table from the attached dataset.
df = pd.read_csv(
    filepath_or_buffer='../input/predict-test-scores-of-students/test_scores.csv'
)

In [None]:
# (rows, columns) of the freshly loaded table.
df.shape

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Column names available in the table.
df.columns

In [None]:
# Preview the first rows of the loaded table.
df.head()

## **Dropping school, student_id**

In [None]:
# Remove the school and student identifier columns; they are not used below.
# Note: running this cell a second time raises KeyError because the columns are gone.
df = df.drop(columns=['school', 'student_id'])

In [None]:
# Distinct school-setting labels present in the data.
df['school_setting'].unique()

In [None]:
# Horizontal bar chart of how many rows fall into each school setting.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(7, 7))
setting_counts = df['school_setting'].value_counts()
setting_counts.plot(kind='barh', ax=ax)
ax.set_title('school setting')
# Flip the axis so the first (largest) category is drawn at the top.
ax.invert_yaxis()
plt.show()

In [None]:
# Distinct school-type labels present in the data.
df['school_type'].unique()

In [None]:
# Pie chart of the share of rows per school type.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(7, 7))
school_type_counts = df['school_type'].value_counts()
# colors/explode hold two entries each, i.e. they expect exactly two categories.
school_type_counts.plot(
    kind='pie',
    ax=ax,
    autopct='%0.1f',
    colors=['g', 'r'],
    explode=[0, 0.05],
)
ax.set_title('school type')
plt.show()

In [None]:
# Distinct classroom identifiers present in the data.
df['classroom'].unique()

# **Dropping Classroom column**

In [None]:
# Remove the classroom identifier column.
# Note: running this cell a second time raises KeyError because the column is gone.
df = df.drop(columns='classroom')

In [None]:
# Preview the table after the classroom column was dropped.
df.head()

# **Teaching method**

In [None]:
# Distinct teaching-method labels present in the data.
df['teaching_method'].unique()

In [None]:
# Pie chart of the share of rows per teaching method.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(7, 7))
method_counts = df['teaching_method'].value_counts()
# colors/explode hold two entries each, i.e. they expect exactly two categories.
method_counts.plot(
    kind='pie',
    ax=ax,
    autopct='%0.1f',
    colors=['g', 'r'],
    explode=[0, 0.05],
)
ax.set_title('Teaching Method')
plt.show()

# **Number of students in class**

In [None]:
# Histogram of the n_student column (class size attached to each row).
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(7, 7))
class_sizes = df['n_student']
class_sizes.plot(kind='hist', ax=ax, rwidth=0.95)
ax.set_title('Students Frequency')
plt.show()


# **Number of students from urban, suburban and rural areas**

In [None]:
# Head-count of students in each school setting.
#
# The previous version summed `n_student` over the rows of each setting.  The
# frame originally carried a `student_id` column, so each row appears to be one
# student and `n_student` that student's class size; summing it would count
# every class once per enrolled student and inflate the totals.  Counting rows
# gives the number of students instead.
# NOTE(review): assumes one row per student - confirm against the dataset description.
students_per_setting = df['school_setting'].value_counts()

# `.get(..., 0)` keeps the pie chart working even if a setting is absent.
urban_students = students_per_setting.get('Urban', 0)
sub_urban_students = students_per_setting.get('Suburban', 0)
rural_students = students_per_setting.get('Rural', 0)

In [None]:
# Pie chart comparing the student totals of the three school settings.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(7, 7))
area_totals = [urban_students, sub_urban_students, rural_students]
area_names = ['Urban', 'Suburban', 'Rural']
ax.pie(area_totals, labels=area_names, autopct='%0.1f', shadow=True)
ax.set_title('Students from different areas')
ax.legend()
plt.show()

# **Teaching Methods in different areas**

In [None]:
# Teaching-method counts within each school setting.
# `.loc[mask, column]` selects rows and column in one step instead of chaining
# two indexing operations.
urban_teaching_method = df.loc[df['school_setting'] == 'Urban', 'teaching_method'].value_counts()
sub_urban_teaching_method = df.loc[df['school_setting'] == 'Suburban', 'teaching_method'].value_counts()
rural_teaching_method = df.loc[df['school_setting'] == 'Rural', 'teaching_method'].value_counts()

# The keys become figure titles, so they use the same spelling and
# capitalisation as the `school_setting` labels ('Urban', 'Suburban', 'Rural')
# rather than the earlier mix of 'Sub Urban' and 'rural'.
teaching_method_in_different_areas = {
    'Urban': urban_teaching_method,
    'Suburban': sub_urban_teaching_method,
    'Rural': rural_teaching_method,
}


In [None]:
# One pie chart per school setting; the title carries the number of rows counted.
for area, method_counts in teaching_method_in_different_areas.items():
    plt.style.use('fivethirtyeight')
    fig, ax = plt.subplots(figsize=(7, 7))
    method_counts.plot(
        kind='pie',
        ax=ax,
        autopct='%0.1f',
        colors=['y', 'b'],
        shadow=True,
    )
    ax.set_title(f'Teaching Method in {area} ({method_counts.sum()})')
    plt.show()
    # Blank output between consecutive figures.
    print('\n')

# **School Type in different areas**

In [None]:
# School-type counts within each school setting.
# `.loc[mask, column]` selects rows and column in one step instead of chaining
# two indexing operations.
urban_school_type = df.loc[df['school_setting'] == 'Urban', 'school_type'].value_counts()
sub_urban_school_type = df.loc[df['school_setting'] == 'Suburban', 'school_type'].value_counts()
rural_school_type = df.loc[df['school_setting'] == 'Rural', 'school_type'].value_counts()

# The keys become figure titles, so they use the same spelling and
# capitalisation as the `school_setting` labels ('Urban', 'Suburban', 'Rural')
# rather than the earlier mix of 'Sub urban' and 'rural'.
school_type_in_different_areas = {
    'Urban': urban_school_type,
    'Suburban': sub_urban_school_type,
    'Rural': rural_school_type,
}

In [None]:
# One pie chart per school setting; the title carries the number of rows counted.
for area, type_counts in school_type_in_different_areas.items():
    plt.style.use('fivethirtyeight')
    fig, ax = plt.subplots(figsize=(7, 7))
    type_counts.plot(
        kind='pie',
        ax=ax,
        autopct='%0.1f',
        colors=['grey', 'pink'],
        shadow=True,
    )
    ax.set_title(f'School type in {area} ({type_counts.sum()})')
    plt.show()
    # Blank output between consecutive figures.
    print('\n')

In [None]:
# Pie chart of the share of rows per gender.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(7, 7))
gender_counts = df['gender'].value_counts()
gender_counts.plot(kind='pie', ax=ax, autopct='%0.1f', colors=['b', 'g'], shadow=True)
ax.set_title('Gender')
plt.show()

In [None]:
# Pie chart of the share of rows per lunch category.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(7, 7))
lunch_counts = df['lunch'].value_counts()
# colors/explode hold two entries each, i.e. they expect exactly two categories.
lunch_counts.plot(
    kind='pie',
    ax=ax,
    autopct='%0.1f',
    colors=['g', 'r'],
    explode=[0, 0.05],
)
ax.set_title('Lunch')
plt.show()

# **Pre-Test scores in Standard and Experimental teaching method**

In [None]:
# Pre-test scores split by teaching method.
taught_standard = df['teaching_method'] == 'Standard'
taught_experimental = df['teaching_method'] == 'Experimental'

standard_scores_pre = df.loc[taught_standard, 'pretest']
experimental_scores_pre = df.loc[taught_experimental, 'pretest']

# Keyed by method name; the keys are reused as figure titles.
scores_in_diff_methods_pre = dict(
    Standard=standard_scores_pre,
    Experimental=experimental_scores_pre,
)

In [None]:
# One histogram of pre-test scores per teaching method; the title shows the
# number of students in that group.
for key, col in scores_in_diff_methods_pre.items():
    plt.style.use('fivethirtyeight')
    plt.figure(figsize=(7, 7))
    col.plot(kind='hist', rwidth=0.95)
    plt.title(f'{key} Teaching Method Pre-Test Scores ({len(col)})')
    # In a histogram the scores are binned along the x-axis and the bar height
    # is a count, so 'Marks' belongs on x (it was previously set as the y label).
    plt.xlabel('Marks')
    plt.ylabel('Frequency')
    plt.show()
    print('\n')

### **As we can see, students taught with the Experimental method are more likely to score high marks on the pretest than students taught with the Standard method**

# **Post-Test scores in Standard and Experimental teaching method**

In [None]:
# Post-test scores split by teaching method.
taught_standard = df['teaching_method'] == 'Standard'
taught_experimental = df['teaching_method'] == 'Experimental'

standard_scores_post = df.loc[taught_standard, 'posttest']
experimental_scores_post = df.loc[taught_experimental, 'posttest']

# Keyed by method name; the keys are reused as figure titles.
scores_in_diff_methods_post = dict(
    Standard=standard_scores_post,
    Experimental=experimental_scores_post,
)

In [None]:
# One histogram of post-test scores per teaching method; the title shows the
# number of students in that group.
for key, col in scores_in_diff_methods_post.items():
    plt.style.use('fivethirtyeight')
    plt.figure(figsize=(7, 7))
    col.plot(kind='hist', rwidth=0.95)
    plt.title(f'{key} Teaching Method Post-Test Scores ({len(col)})')
    # In a histogram the scores are binned along the x-axis and the bar height
    # is a count, so 'Marks' belongs on x (it was previously set as the y label).
    plt.xlabel('Marks')
    plt.ylabel('Frequency')
    plt.show()
    print('\n')

### **As we can see, students taught with the Experimental method are more likely to score high marks on the posttest than students taught with the Standard method**

## **Let's See which gender scored more marks**

In [None]:
# Distribution of pre-test scores for female students.
female_pretest = df.loc[df['gender'] == 'Female', 'pretest']
female_pretest.plot(kind='hist', rwidth=0.95)
plt.title('Female Pretest Histogram')
plt.show()

In [None]:
# Distribution of pre-test scores for male students.
male_pretest = df.loc[df['gender'] == 'Male', 'pretest']
male_pretest.plot(kind='hist', rwidth=0.95)
plt.title('Male Pretest Histogram')
plt.show()

In [None]:
# Distribution of post-test scores for female students.
female_posttest = df.loc[df['gender'] == 'Female', 'posttest']
female_posttest.plot(kind='hist', rwidth=0.95)
plt.title('Female Posttest Histogram')
plt.show()

In [None]:
# Distribution of post-test scores for male students.
male_posttest = df.loc[df['gender'] == 'Male', 'posttest']
male_posttest.plot(kind='hist', rwidth=0.95)
plt.title('Male Posttest Histogram')
plt.show()

## **One hot Encoding**

In [None]:
# Preview the columns before they are encoded.
df.head()

In [None]:
# Labels that the dummy encoding below will turn into indicator columns.
df.school_setting.unique()

In [None]:
# Indicator columns for school_setting; drop_first omits the first category so
# the remaining columns are not redundant with one another.
school_setting_dummies = pd.get_dummies(
    data=df['school_setting'],
    drop_first=True,
)

In [None]:
# The text column is no longer needed once its indicator columns exist.
# Note: running this cell a second time raises KeyError because the column is gone.
df = df.drop(columns=['school_setting'])

In [None]:
# Check which school_type labels are present before encoding them.
df['school_type'].value_counts()

In [None]:
# Encode school type as a binary feature: Public -> 1, Non-public -> 0.
#
# The result is assigned back to the column.  Calling `replace(..., inplace=True)`
# on the `df['school_type']` selection is a chained in-place update: pandas 2.x
# warns about it, and with Copy-on-Write it modifies a temporary object and
# leaves `df` unchanged.
# `astype(int)` pins an integer dtype instead of relying on `replace` to infer
# one, and fails loudly if an unexpected label is left in the column.
# Re-running the cell is harmless: already-encoded values pass through as-is.
df['school_type'] = df['school_type'].replace({'Public': 1, 'Non-public': 0}).astype(int)

In [None]:
# Check which teaching_method labels are present before encoding them.
df['teaching_method'].value_counts()

In [None]:
# Encode teaching method as a binary feature: Standard -> 1, Experimental -> 0.
#
# The result is assigned back to the column.  Calling `replace(..., inplace=True)`
# on the `df['teaching_method']` selection is a chained in-place update: pandas
# 2.x warns about it, and with Copy-on-Write it modifies a temporary object and
# leaves `df` unchanged.
# `astype(int)` pins an integer dtype instead of relying on `replace` to infer
# one, and fails loudly if an unexpected label is left in the column.
# Re-running the cell is harmless: already-encoded values pass through as-is.
df['teaching_method'] = df['teaching_method'].replace({'Standard': 1, 'Experimental': 0}).astype(int)

In [None]:
# Check which gender labels are present before encoding them.
df['gender'].value_counts()

In [None]:
# Encode gender as a binary feature: Male -> 1, Female -> 0.
#
# The result is assigned back to the column.  Calling `replace(..., inplace=True)`
# on the `df['gender']` selection is a chained in-place update: pandas 2.x warns
# about it, and with Copy-on-Write it modifies a temporary object and leaves
# `df` unchanged.
# `astype(int)` pins an integer dtype instead of relying on `replace` to infer
# one, and fails loudly if an unexpected label is left in the column.
# Re-running the cell is harmless: already-encoded values pass through as-is.
df['gender'] = df['gender'].replace({'Male': 1, 'Female': 0}).astype(int)

In [None]:
# Check which lunch labels are present before encoding them.
df['lunch'].value_counts()

In [None]:
# Encode lunch status as a binary feature:
# 'Does not qualify' -> 1, 'Qualifies for reduced/free lunch' -> 0.
#
# The result is assigned back to the column.  Calling `replace(..., inplace=True)`
# on the `df['lunch']` selection is a chained in-place update: pandas 2.x warns
# about it, and with Copy-on-Write it modifies a temporary object and leaves
# `df` unchanged.
# `astype(int)` pins an integer dtype instead of relying on `replace` to infer
# one, and fails loudly if an unexpected label is left in the column.
# Re-running the cell is harmless: already-encoded values pass through as-is.
df['lunch'] = df['lunch'].replace(
    {'Does not qualify': 1, 'Qualifies for reduced/free lunch': 0}
).astype(int)

In [None]:
# Shape after dropping and encoding columns, before the dummy columns are added.
df.shape

In [None]:
# Put the school-setting indicator columns in front of the remaining columns.
new_df = pd.concat(objs=[school_setting_dummies, df], axis='columns')
new_df.shape

In [None]:
# Preview the combined frame (indicator columns first, then the rest).
new_df.head()

In [None]:
# Summary statistics of the two numeric features ahead of scaling.
df[['n_student', 'pretest']].describe()

## **Scaling n_student and pretest** 

In [None]:
# Divide both numeric features by 100.
# Note: this overwrites the columns, so re-running the cell divides them again.
for scaled_col in ('n_student', 'pretest'):
    new_df[scaled_col] = new_df[scaled_col].div(100)

In [None]:
# Dividing by 100 is meant to bring n_student and pretest into roughly the 0-1 range (this holds only while the raw values do not exceed 100).

In [None]:
# Preview the frame after n_student and pretest were divided by 100.
new_df.head()

## **Splitting and Training the values** 

In [None]:
# Separate the target (posttest) from the predictor columns.
y = new_df['posttest']
x = new_df.drop(columns='posttest')

In [None]:
# Feature matrix and target should have the same number of rows.
x.shape, y.shape

In [None]:
# Hold out 25% of the rows for testing.
# A fixed random_state makes the shuffle, and therefore every score and error
# reported below, reproducible across kernel restarts; without it each run
# draws a different split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [None]:
# Sizes of the training and test feature matrices.
x_train.shape, x_test.shape

In [None]:
# Sizes of the training and test targets.
y_train.shape, y_test.shape

In [None]:
# Fit an ordinary least-squares model on the training split.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [None]:
# R^2 of the fitted model on the held-out split.
model.score(X=x_test, y=y_test)

In [None]:
# R^2 of the fitted model on the training split, for comparison with the test score.
model.score(X=x_train, y=y_train)

## **Cross val scores** 

In [None]:
# Candidate regressors to compare with cross-validation.
models = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    SVR(kernel='linear'),
    KNeighborsRegressor(),
]

In [None]:
# 10-fold cross-validation of every candidate on the full data set; prints the
# per-fold scores followed by their mean.
for candidate in models:
    print('Model:', candidate)
    fold_scores = cross_val_score(candidate, x, y, cv=10)
    print(fold_scores)
    print(fold_scores.mean())

    print('\n')

## **Since Linear Regression gives a high cross-validation score (R²), we proceed with it**

## **Metric evaluations and correlation**

In [None]:
# Predictions of the fitted linear model on both splits.
y_test_predict = model.predict(X=x_test)
y_train_predict = model.predict(X=x_train)

In [None]:
# Mean squared error on the held-out split.
mean_squared_error(y_true=y_test, y_pred=y_test_predict)

In [None]:
# Mean squared error on the training split.
mean_squared_error(y_true=y_train, y_pred=y_train_predict)

In [None]:
# R^2 on the held-out split.
r2_score(y_true=y_test, y_pred=y_test_predict)

In [None]:
# R^2 on the training split.
r2_score(y_true=y_train, y_pred=y_train_predict)

In [None]:
# Side-by-side tables of actual versus predicted post-test scores.
test_columns = {'Y test': y_test, 'Y test predicted': y_test_predict}
test = pd.DataFrame(data=test_columns)

train_columns = {'Y train': y_train, 'Y train predicted': y_train_predict}
train = pd.DataFrame(data=train_columns)






In [None]:
# Correlation between actual and predicted scores on the held-out split.
test.corr()

In [None]:
# Correlation between actual and predicted scores on the training split.
train.corr()

In [None]:
# Ten randomly chosen held-out rows: actual score next to the prediction.
test.sample(n=10)

In [None]:
# Ten randomly chosen training rows: actual score next to the prediction.
train.sample(n=10)

## **Saving the model to a file**

In [None]:
import joblib

# Serialise the fitted linear model into the working directory.
joblib.dump(value=model, filename='predict-student-score-model')

In [None]:
# List the working directory, presumably to confirm the model file was written.
!ls