In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression

In [None]:
# Read the students-performance CSV into a DataFrame and preview the first three rows.
csv_path = '../input/students-performance-in-exams/StudentsPerformance.csv'
data = pd.read_csv(csv_path)
data.head(n = 3)

## Missing Values/Null Values

In [None]:
# Summarise the frame (column dtypes and non-null counts) to spot missing values.
data.info()

In [None]:
# Per-column flag: True if the column contains at least one missing value.
data.isna().any()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the numeric columns.
data.describe()

### Gender

In [None]:
# Title-case the gender labels (e.g. 'female' -> 'Female'); later cells compare
# against 'Male' / 'Female'. The vectorised .str accessor also tolerates missing values.
data['gender'] = data['gender'].str.title()

# Keep only rows with a positive math score. The explicit .copy() makes `data` an
# independent frame, so later column assignments do not write into a slice.
data = data[data['math score'] > 0].copy()

sns.set_color_codes("muted")
sns.set_style('darkgrid')
sns.set(font_scale = 1.25)

# Count once and reuse for the pie so that labels always match the wedge order
# (value_counts sorts by frequency, not alphabetically).
gender_counts = data['gender'].value_counts()

fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (12, 6))

fig.suptitle('Gender Proportion', fontsize = 20)
# Pass the column by keyword: recent seaborn releases treat the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x = 'gender', data = data, ax = ax1, palette = 'Paired')
ax2.pie(gender_counts, labels = gender_counts.index, explode = (0.1, 0), autopct = '%1.1f%%', shadow = True, colors = ['lightblue', 'b'])
plt.show()

### Race/Ethnicity

In [None]:
# Frequency table of race/ethnicity groups, most common first.
race = data['race/ethnicity'].value_counts().to_frame('Count')
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (12, 6))

fig.suptitle('Ethnicity Proportion', fontsize = 20)
# Keyword `x`/`data` (positional vectors are not interpreted as `x` by recent
# seaborn); `order` keeps the bars alphabetically sorted as before.
sns.countplot(x = 'race/ethnicity', data = data, order = sorted(data['race/ethnicity'].unique()), ax = ax1)
# Pull out only the largest wedge; sized from the data instead of assuming five groups.
explode = [0.1] + [0] * (len(race) - 1)
ax2.pie(race['Count'], labels = race.index, autopct = '%1.1f%%', shadow = True, explode = explode)
plt.show()

### Parental level of education

In [None]:
# Frequency table of parental education levels, most common first.
level = data['parental level of education'].value_counts().to_frame('Count')

# Stacked layout (bar chart above pie) because the category labels are long.
fig, (ax1, ax2) = plt.subplots(nrows = 2, ncols = 1, figsize = (12, 15))

fig.suptitle('Parental Level of Education Proportion', fontsize = 20)
# Keyword `x`/`data` (positional vectors are not interpreted as `x` by recent
# seaborn); `order` keeps the bars alphabetically sorted as before.
sns.countplot(x = 'parental level of education', data = data,
              order = sorted(data['parental level of education'].unique()), ax = ax1)
ax2.pie(level['Count'], labels = level.index, autopct = '%1.1f%%', shadow = True)
plt.show()

### Lunch

In [None]:
# Frequency table of lunch types, most common first.
lunch = data['lunch'].value_counts().to_frame('Count')

sns.set_style('darkgrid')
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (12, 6))

fig.suptitle('Lunch Proportion', fontsize = 20)
# Pass the column by keyword: recent seaborn releases treat the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x = 'lunch', data = data, ax = ax1, palette = 'hls')
ax2.pie(lunch['Count'], labels = lunch.index, explode = (0.1, 0), autopct = '%1.1f%%', shadow = True, colors = ['pink', 'lightblue'])
plt.show()

### Test preparation course

In [None]:
# Frequency table of test-preparation status, most common first.
test = data['test preparation course'].value_counts().to_frame('Count')

fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (12, 6))

fig.suptitle('Test Preparation Course Proportion', fontsize = 20)
# Pass the column by keyword: recent seaborn releases treat the first positional
# argument as `data`, not as the variable to count.
sns.countplot(x = 'test preparation course', data = data, ax = ax1, palette = 'magma')
ax2.pie(test['Count'], labels = test.index, explode = (0.1, 0), autopct = '%1.1f%%', shadow = True, colors = ['purple', 'b'])
plt.show()

### Correlation Matrix

In [None]:
# Correlation heatmap of the numeric columns only. The frame still holds string
# columns at this point, and DataFrame.corr() no longer drops them silently on
# recent pandas, so they are filtered out explicitly.
numeric_cols = data.select_dtypes(include = 'number')

plt.figure(figsize = (7, 7))
sns.heatmap(numeric_cols.corr())
plt.show()

+ It looks like a student who is good at reading is likely to be good at writing as well, but the same can't be said about maths. Here I'm only predicting maths performance.

# Analysis on Target Values

### Math score

In [None]:
# Density-normalised histogram of math scores with a KDE overlay.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement,
# and cut=3 extends the KDE past the data range the way distplot did.
fig, ax1 = plt.subplots(nrows = 1, ncols = 1, figsize = (12, 6))

fig.suptitle('Math Test Score Distribution', fontsize = 20)
sns.histplot(x = data['math score'], kde = True, stat = 'density', kde_kws = {'cut': 3}, ax = ax1, color = 'm')
plt.show()

In [None]:
# Bucket the math scores into right-closed intervals and show each bucket's share.
bins = [0, 40, 50, 60, 70, 80, 100]
binned_scores = pd.cut(data['math score'], bins)
math = binned_scores.value_counts().to_frame('Count')

fig, ax = plt.subplots(figsize = (6, 6))
ax.pie(math['Count'], labels = math.index, autopct = '%1.1f%%', shadow = True)
ax.set_title('Math Score', fontsize = 20)
plt.show()

In [None]:
# Scatter every student's math score against its row index to eyeball the spread.
fig, ax = plt.subplots(figsize = (10, 6))
sns.scatterplot(x = data.index, y = data['math score'], ax = ax)
plt.show()

+ Almost 49% of students have a maths score above 60.
+ Let's see the maths score distribution of male and female students.

In [None]:
# Split the frame by gender (labels were title-cased earlier in the notebook).
male_score = data[data['gender'] == 'Male']
female_score = data[data['gender'] == 'Female']

# Overlay the two math-score distributions. sns.distplot is deprecated, so use
# histplot with a KDE overlay; stat='density' keeps the two groups comparable
# even though their sizes differ.
hist_kwargs = dict(kde = True, stat = 'density', kde_kws = {'cut': 3})

plt.figure(figsize = (10, 6))
sns.histplot(x = male_score['math score'], color = 'r', label = 'Male', **hist_kwargs)
sns.histplot(x = female_score['math score'], color = 'g', label = 'Female', **hist_kwargs)
plt.legend()
plt.show()

+ More male students than female students have a math score greater than 65 or 70.
+ Now, let's look at how parental education level affects students' performance in maths.

In [None]:
# One math-score distribution panel per parental education level.
education_levels = data['parental level of education'].unique()

# Size the two-column grid from the number of levels so none is silently dropped
# (six levels give the same 3x2, 12x12 figure as before).
n_rows = (len(education_levels) + 1) // 2
fig, axes_grid = plt.subplots(nrows = n_rows, ncols = 2, figsize = (12, 4 * n_rows), squeeze = False)
axes = list(axes_grid.flat)

for i, ax in zip(education_levels, axes):
    scores = data.loc[data['parental level of education'] == i, 'math score']
    # sns.distplot is deprecated; histplot with a KDE overlay replaces it.
    sns.histplot(x = scores, kde = True, stat = 'density', kde_kws = {'cut': 3}, label = i, ax = ax)
    ax.legend()

# Hide any leftover panel when the number of levels is odd.
for ax in axes[len(education_levels):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

+ Next, let's see how the test preparation course affects students' performance.

In [None]:
plt.figure(figsize = (8, 6))
sns.distplot(data['math score'][data['test preparation course'] == 'completed'], label = 'Completed', color = 'b')
sns.distplot(data['math score'][data['test preparation course'] == 'none'], label = 'Not Completed', color = 'r')
plt.legend()
plt.show()

+ So, clearly, most of the students who completed the test preparation course scored higher.

# Data Preprocessing

In [None]:
# Encode the three two-valued columns as 0/1 integers and strip the possessive
# from the degree names. A new frame is built with .assign() instead of assigning
# column-by-column, because `data` may be a filtered slice of the loaded frame.
# .astype(int) pins the integer dtype rather than relying on replace() to infer it.
data = data.assign(**{
    'gender': data['gender'].replace({'Male' : 1, 'Female' : 0}).astype(int),
    'test preparation course': data['test preparation course'].replace({'completed' : 1, 'none' : 0}).astype(int),
    'lunch': data['lunch'].replace({'standard' : 1, 'free/reduced' : 0}).astype(int),
    'parental level of education': data['parental level of education'].replace({"bachelor's degree" : 'bachelor degree',
                                                                                "associate's degree" : 'associate degree',
                                                                                "master's degree" : 'master degree'}),
})

### Dummy Coding Scheme

In [None]:
# One indicator column per parental education level and per race/ethnicity group.
edu_dummies = pd.get_dummies(data['parental level of education'])
race_dummies = pd.get_dummies(data['race/ethnicity'])

# Append the indicators, then remove the two source columns and the other two
# exam scores so that only 'math score' remains as a score column.
columns_to_drop = ['race/ethnicity', 'parental level of education', 'reading score', 'writing score']
data = pd.concat([data, edu_dummies, race_dummies], axis = 1).drop(columns = columns_to_drop)

In [None]:
# Preview the first rows of the encoded frame.
data.head()

# Model Selection

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target is the math score; every remaining column is a predictor.
y = data['math score']
X = data.drop(columns = 'math score')

In [None]:
# Hold out 35% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.35,
    random_state = 0,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_absolute_error

In [None]:
#Linear Regression
# The math score is a continuous target. LogisticRegression is a classifier and
# would treat every distinct score as its own class, so a linear regressor is
# used as the baseline instead.
l_reg = LinearRegression().fit(X_train, y_train)
l_reg_pred = l_reg.predict(X_test)

#Decision Tree (seeded so that re-running the notebook gives the same error)
tree_reg = DecisionTreeRegressor(random_state = 0).fit(X_train, y_train)
tree_pred = tree_reg.predict(X_test)

#Random Forest (seeded for the same reason)
forest_reg = RandomForestRegressor(random_state = 0).fit(X_train, y_train)
forest_pred = forest_reg.predict(X_test)

#K-Nearest Neighbors
neighbors = KNeighborsRegressor(n_neighbors= 15).fit(X_train, y_train)
neighbors_pred = neighbors.predict(X_test)

In [None]:
# Report the mean absolute error of each model on the held-out test set.
reg = ['Linear Regression', 'Decision Tree Regressor', 'Random Forest Regressor', 'KNN Regressor']
pred = [l_reg_pred, tree_pred, forest_pred, neighbors_pred]

for r, p in zip(reg, pred):
    print(r)
    print('Mean Absolute Error:', mean_absolute_error(y_test, p))
    print()

# Let me know in the comment section if I can add anything else, or if there is a better approach for Model Selection.

### Upvote if you like my work
# Thanks.....
