In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import category_encoders as ce

import lightgbm as ltb
from sklearn import metrics
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import plotly.express as px


import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



In [None]:
# Read the test_scores.csv file from the Kaggle input directory into a DataFrame
df=pd.read_csv('/kaggle/input/predict-test-scores-of-students/test_scores.csv')

# Understanding the data

In [None]:
# Number of rows in the dataset
len(df)

In [None]:
# Preview the first rows
df.head()

In [None]:
# Summary statistics of the columns
df.describe()

In [None]:
# Column overview: names, non-null counts and dtypes
df.info()

In [None]:
# Data type of every column
df.dtypes

In [None]:
# How the categorical (object-dtype) data is distributed: for every such column
# print its name, the number of distinct values, and the count of each value.
for col in df.select_dtypes(include='object').columns:
    counts = df[col].value_counts()  # computed once and reused for both prints below
    print(col)
    print(len(counts))
    print(counts)

In [None]:
# Dropping student_id since it is an id column.
# errors='ignore' keeps the cell idempotent: re-running it once the column is
# already gone no longer raises a KeyError.
df=df.drop(columns='student_id',errors='ignore')
df.head()

****

In [None]:
# Columns remaining in the DataFrame
df.columns

# **EDA**

In [None]:
# Row counts per school setting; the urban zone has the largest count.
# NOTE(review): the raw data carried a student_id column, so rows look like
# individual students and the bars would count students rather than schools —
# confirm whether the title should say "Students".
fig = px.histogram(
    df,
    x='school_setting',
    color='school_setting',
    title="Count of Schools by Zones",
)
fig.show()

In [None]:
# Mean posttest and mean n_student for each school setting.
# The columns are selected with a list ([[...]]): selecting several groupby columns
# with a bare tuple (['posttest','n_student']) was deprecated and raises an error
# in pandas >= 2.0.
zone = df.groupby(['school_setting'])[['posttest','n_student']].mean().reset_index()
zone

In [None]:

# Mean post-test score per school setting, bars coloured by the mean n_student
fig = px.bar(
    zone,
    x='school_setting',
    y='posttest',
    color='n_student',
    title="School's Zone impact on test score",
)
fig.show()

****Students studying in urban and rural areas score lower than those in suburban areas.****
****Urban schools have more students, which suggests that the number of students also has an impact on the test score.****

In [None]:
# Distribution of n_student: density estimate on the left, box plot on the right
fig, (kde_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))
sns.kdeplot(data=df, x="n_student", ax=kde_ax)
sns.boxplot(data=df, x='n_student', ax=box_ax)
plt.show()

****The number of students is fairly evenly distributed, and the number of students is negatively correlated with the test score.****


In [None]:
# Linear fit of n_student against posttest, one regression line per school setting
sns.lmplot(
    data=df,
    x='posttest',
    y='n_student',
    hue='school_setting',
    ci=None,
)
plt.show()

**The number of students has a high impact on test scores**

In [None]:
# Overlaid density estimates of the pre-test and post-test scores.
# fill=True replaces the deprecated shade=True keyword of kdeplot.
plt.figure(figsize=(10,5))
sns.kdeplot(data=df['pretest'], fill=True, label='Pre-test')
sns.kdeplot(data=df['posttest'], fill=True, label='Post-test')
plt.title('Distribution of Pre and Post Tests')
# The legend has to be added while the density figure is still the current figure:
# lmplot below creates its own figure, so calling plt.legend() after it would act on
# the regression plot and leave the two density curves unlabelled.
plt.legend()

# Linear fit of pretest against posttest, drawn on a separate figure
sns.lmplot(x='posttest',y='pretest',ci = None, data=df)
plt.show()

**Pre-test and post-test scores are highly correlated**

In [None]:
# Row counts per gender; both genders are equally present
fig = px.histogram(
    df,
    x='gender',
    color='gender',
    title="Count of Students by Gender",
)
fig.show()

In [None]:
# Mean post-test score for each teaching method, as a flat two-column frame
teaching_method = (
    df.groupby(['teaching_method'])['posttest']
    .mean()
    .reset_index()
)

In [None]:
# Row counts per teaching method; the experimental method has the smaller count.
# NOTE(review): the raw data carried a student_id column, so rows look like
# individual students and the bars would count students rather than schools —
# confirm whether the title should say "students".
fig = px.histogram(
    df,
    x='teaching_method',
    color='teaching_method',
    title="Count of schools by teaching method",
)
fig.show()

In [None]:

# Mean post-test score per teaching method, bars coloured by that same mean
fig = px.bar(
    teaching_method,
    x='teaching_method',
    y='posttest',
    color='posttest',
    title="Teaching method's impact on Test Score",
)
fig.show()

**Fewer schools use the experimental strategy, but the average score of their students is higher than that of the standard ones.**
**Teaching method shows a significant impact on test score.**

In [None]:
# For every (school, school_type, teaching_method) group: number of rows
# (n_students) and mean post-test score (posttest)
school = (
    df.groupby(['school','school_type','teaching_method'])
    .agg(n_students=('n_student','size'), posttest=('posttest','mean'))
    .reset_index()
)

In [None]:
# Rows per school, split by teaching method; each bar is labelled with the
# group's mean post-test score
fig = px.bar(
    school,
    x='school',
    y='n_students',
    color='teaching_method',
    text='posttest',
    hover_data=['school_type','posttest'],
    title="Teaching method's impact on Test Score",
)
fig.show()

**Almost all schools are supporting the Experimental way of teaching, and the results are significantly better.**

**Since we have seen the impact of the number of students on the test score, the teaching method also supports it, as the Experimental teaching method has fewer students**

In [None]:
# Reminder of the available columns before the next grouping
df.columns

In [None]:
# For every (lunch, school_type, school_setting) group: number of rows
# (n_students) and mean post-test score (posttest)
school_type = (
    df.groupby(['lunch','school_type','school_setting'])
    .agg(n_students=('n_student','size'), posttest=('posttest','mean'))
    .reset_index()
)
school_type

In [None]:
# Rows per lunch category, split by school type; each bar segment is labelled
# with the group's mean post-test score
fig = px.bar(
    school_type,
    x='lunch',
    y='n_students',
    color='school_type',
    text='posttest',
    hover_data=['lunch','posttest'],
    title="School's Type impact on Test Score",
)
fig.show()

**Students who do not qualify for free/reduced lunch score well, maybe because of better conditions**

**Also, students in non-public schools score significantly higher**

In [None]:

# Rows per school setting, split by lunch category; each bar segment is labelled
# with the group's mean post-test score
fig = px.bar(
    school_type,
    x='school_setting',
    y='n_students',
    color='lunch',
    text='posttest',
    hover_data=['school_type','posttest'],
    title="School's Zone and number of students impact on Test Score",
)
fig.show()

**The test score for suburban students in non-public schools who do not qualify for free food is significantly better**

In [None]:
# plotting correlation heatmap of the numeric columns

# setting the dimensions of the plot
fig, ax = plt.subplots(figsize=(15, 5))

# Only numeric columns can be correlated. The frame still holds categorical
# (object) columns, and pandas >= 2.0 raises on them instead of dropping them
# silently, so restrict the frame to numeric dtypes before calling corr().
numeric_df = df.select_dtypes(include='number')

# drawing the plot
dataplot = sns.heatmap(numeric_df.corr(), cmap="YlGnBu", annot=True, ax=ax)
plt.show()



# Data Preparation

In [None]:
def data_prep(df,low_card_features,high_card_features,n):
    """Encode the categorical columns of ``df`` and return the combined frame.

    Parameters
    ----------
    df : DataFrame holding both feature lists as columns.
    low_card_features : list of column names to one-hot encode
        (``pd.get_dummies`` with ``drop_first=True``).
    high_card_features : list of column names to encode with
        ``ce.HashingEncoder``.
    n : value passed to the hashing encoder as ``n_components``.

    Returns
    -------
    A new DataFrame made of the untouched columns of ``df`` followed by the
    dummy columns and then the hashed columns. The caller's ``df`` is not
    modified.
    """
    # Low-cardinality nominal columns -> dummy variables
    dummies = pd.get_dummies(data=df[low_card_features], drop_first=True)

    # High-cardinality columns -> hashing encoder
    hasher = ce.HashingEncoder(cols=high_card_features, n_components=n)
    hashed = hasher.fit_transform(df[high_card_features])

    # Everything that was not encoded is carried over unchanged
    untouched = df.drop(low_card_features, axis=1).drop(high_card_features, axis=1)

    return pd.concat([untouched, dummies, hashed], axis=1)
    

In [None]:
# Columns that data_prep turns into dummy variables
low_card_features=[
    'school_setting',
    'school_type',
    'teaching_method',
    'gender',
    'lunch',
]
# Columns that data_prep passes through the hashing encoder
high_card_features=[
    'school',
    'classroom',
]
# n_components used by the hashing encoder
n=50

df1=data_prep(df,low_card_features,high_card_features,n)
df1.head()

# Modeling

In [None]:
#Using LGBM Regressor
def modeling(df1):
    """Fit a default LGBMRegressor on ``df1`` and report hold-out metrics.

    ``posttest`` is the target; every other column of ``df1`` is a feature.
    The rows are split 80/20 into train and test with ``random_state=42``.
    The R^2 score and then the mean absolute error on the test part are printed.

    Returns
    -------
    The model's predictions for the test part.
    """
    target = df1['posttest']
    features = df1.drop(columns=['posttest'], axis=1)

    regressor = ltb.LGBMRegressor()
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, random_state=42, test_size=0.2
    )
    regressor.fit(X_train, y_train)
    predicted_y = regressor.predict(X_test)

    # Report R^2 first, then MAE, both on the held-out part
    for score in (metrics.r2_score, metrics.mean_absolute_error):
        print(score(y_test, predicted_y))

    return predicted_y

In [None]:
# Train the model, print its R^2 / MAE, and keep the test-split predictions
predicted_y=modeling(df1)

In [None]:
# Show the predictions returned by modeling()
print(predicted_y)