In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline                                 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the exam-results CSV inside the Kaggle input directory
DATA_PATH = '/kaggle/input/students-performance-in-exams/StudentsPerformance.csv'

# Load the file into a DataFrame and preview the first rows
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Peek at 7 randomly drawn rows (unseeded, so the rows differ between runs)
df.sample(n=7)

In [None]:
# Count the missing values in every column
df.isna().sum()

In [None]:
# Print pandas' summary of the frame (DataFrame.info)
df.info()

In [None]:
# Descriptive statistics using DataFrame.describe defaults
df.describe()

In [None]:
# Replace the column labels with short snake_case names.
# The assignment is positional: the i-th label is applied to the i-th column.
short_labels = [
    'gender',
    'race',
    'parent_education',
    'lunch_type',
    'test_completed',
    'math_score',
    'reading_score',
    'writing_score',
]
df.columns = short_labels

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# List the current column labels
df.columns

In [None]:
# Distinct values present in the race column
df.race.unique()

In [None]:
# Histograms drawn with DataFrame.hist defaults
df.hist()

In [None]:
# Number of rows per gender
df.groupby('gender').size()

In [None]:
# Pairwise correlation matrix of the numeric columns.
# The frame also holds string columns, and DataFrame.corr() on non-numeric data
# raises on pandas >= 2.0, so the numeric columns are selected explicitly
# (select_dtypes works on old and new pandas alike).
teju = df.select_dtypes(include='number').corr()
teju

In [None]:
# Heatmap of the matrix held in `teju`, with each cell annotated by its value
sns.heatmap(teju,annot=True)

In [None]:
# Mean of the three exam scores for every student
df['average_score'] = (df['reading_score'] + df['math_score'] + df['writing_score']) / 3

# Turn the mean score into a grade label:
#   poor: [0, 40)   average: [40, 60)   good: [60, 80)   excellent: [80, 100]
# Rows matching none of the ranges get the placeholder 'na'.
mean_score = df['average_score']
grade_conditions = [
    (mean_score >= 80) & (mean_score <= 100),
    (mean_score >= 60) & (mean_score < 80),
    (mean_score >= 40) & (mean_score < 60),
    (mean_score >= 0) & (mean_score < 40),
]
grade_labels = ['excellent', 'good', 'average', 'poor']
df['grade'] = np.select(grade_conditions, grade_labels, default='na')
df.head()

In [None]:
# Number of students in each grade category
plt.figure(figsize=(8,6))
# The column is passed by keyword: in seaborn >= 0.12 the first positional
# argument of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x='grade', data=df, order=["poor","average","good","excellent"], palette='Set1')
plt.title('Grade - Number of Students',fontsize=20)
plt.xlabel('Grade', fontsize=16)
plt.ylabel('Number of Student', fontsize=16)

# Countplot

In [None]:
# Number of students per gender
plt.figure(figsize=(12,12))
# The column is passed by keyword: in seaborn >= 0.12 the first positional
# argument of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x='gender', data=df, palette='Set1')
plt.title('Gender Countplot ',fontsize=14)
plt.xlabel('Gender of student', fontsize=10)
plt.ylabel('Number of Student', fontsize=10)


In [None]:
# Number of students per race/ethnicity group, in alphabetical group order
plt.figure(figsize=(10,10))
# The column is passed by keyword: in seaborn >= 0.12 the first positional
# argument of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x='race', data=df, order = ["group A","group B","group C","group D","group E"], palette='Set1')
plt.title('Race of student - Number of Student',fontsize=14)
plt.xlabel('Race of student', fontsize=10)
plt.ylabel('Number of Student', fontsize=10)

# PIECHART

In [None]:
# Share of students in each race/ethnicity group.
# A single pie is drawn on its own axes; the previous 1x4 subplot grid on a
# 30x20 canvas used only one slot and left the rest of the figure empty.
fig, ax = plt.subplots(figsize=(8, 8))
df['race'].value_counts().plot.pie(autopct="%1.1f%%", ax=ax)
ax.set_title('Ethnicity', fontsize=20)

# HEATMAP

In [None]:
# Annotated heatmap of the correlation matrix `teju`, with both sets of tick
# labels rotated by 90 degrees.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Correlation')
sns.heatmap(teju, annot=True, cmap='viridis', linecolor='red', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.setp(ax.get_yticklabels(), rotation=90)
plt.show()

# Violin and Swarmplot

In [None]:
# Violin plot of math scores for each gender
plt.figure(figsize=(10, 6))
sns.violinplot(data=df, x='gender', y='math_score', split=True)
sns.despine(left=True)
plt.show()

In [None]:
# Swarm plot of the individual math scores for each gender
sns.swarmplot(data=df, x='gender', y='math_score')
plt.show()

# SCATTER PLOT

In [None]:
# Math score against writing score, one point per student
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(df['writing_score'], df['math_score'])
ax.set_xlabel("Writing_Score", fontsize=16)
ax.set_ylabel("Math_Score", fontsize=16)
plt.show()

In [None]:
# Writing score against reading score, one point per student
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(df['reading_score'], df['writing_score'])
ax.set_xlabel("Reading Score", fontsize=16)
ax.set_ylabel("Writing Score", fontsize=16)
plt.show()

In [None]:
# Reading score against math score, points coloured by gender
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='math_score',
    y='reading_score',
    hue='gender',
    palette='viridis',
)

# VIOLIN AND SWARMPLOT

In [None]:
# Violin plot of reading scores for each gender
plt.figure(figsize=(10, 6))
sns.violinplot(data=df, x='gender', y='reading_score', split=True)
sns.despine(left=True)
plt.show()

# Swarm plot of the same data, shown as a separate figure
sns.swarmplot(data=df, x='gender', y='reading_score')
plt.show()

In [None]:

# Now exploring the writing score: number of students at each distinct score
p = sns.countplot(data=df, x="writing_score", palette="muted")
# Rotate the x tick labels; the result is bound to `_` so the cell shows no output
_ = plt.setp(p.get_xticklabels(), rotation=90)

In [None]:
# Pass-mark threshold used to categorize the students
passmarks = 35

In [None]:
# Label every student by the writing exam result: a score strictly below
# `passmarks` is 'Failed!', every other row is 'Passed!'.
below_pass_mark = df['writing_score'].lt(passmarks)
df['WritingPassedStatus'] = np.where(below_pass_mark, 'Failed!', 'Passed!')
df['WritingPassedStatus'].value_counts()

# HISTOGRAM

In [None]:
# One 20-bin histogram per exam, side by side on a shared y axis
fig, axs = plt.subplots(1, 3, figsize=(16, 5), sharey=True)
panels = [
    ("math_score", "Math score"),
    ("reading_score", "Reading score"),
    ("writing_score", "Writing score"),
]
for axis, (score_column, panel_title) in zip(axs, panels):
    axis.hist(df[score_column], bins=20)
    axis.set_title(panel_title)
fig.suptitle("Distributed exam scores")

# RELPLOT

In [None]:
# Writing score as a function of reading score
sns.relplot(data=df, x='reading_score', y='writing_score')

In [None]:
# Same pair of columns with the axes swapped: reading score against writing score
sns.relplot(data=df, x='writing_score', y='reading_score')

In [None]:
# Sum of the three exam scores, and that sum divided by 3
total_marks = df["math_score"] + df["reading_score"] + df["writing_score"]
df["Total marks"] = total_marks
df["Percentage"] = total_marks / 3
df.head()

# BOXENPLOT

In [None]:
# Distribution of Percentage for each lunch type
sns.boxenplot(data=df, x="lunch_type", y="Percentage")

In [None]:
# Distribution of Percentage for each test_completed value
sns.boxenplot(data=df, x='test_completed', y='Percentage', palette="hls")

In [None]:
# Grid of pairwise plots for the frame (seaborn pairplot defaults)
sns.pairplot(df)

# BARPLOT

In [None]:
# Bar plot of each exam score by gender, one panel per exam on a shared y axis
fig, axs = plt.subplots(1, 3, figsize=(15, 6), sharey=True)
score_columns = ('math_score', 'reading_score', 'writing_score')
for panel, score_column in zip(axs, score_columns):
    sns.barplot(data=df, x='gender', y=score_column, ax=panel)

In [None]:
# Pairwise plots coloured by parent_education (the hue column), with KDE
# curves on the diagonal and semi-transparent, black-edged markers
sns.pairplot(df,hue = 'parent_education', diag_kind = 'kde', plot_kws = {'alpha': 0.6, 's': 80, 'edgecolor': 'k'},height = 4);

# FACETGRID

In [None]:
# Score distributions split by preparation-test status, one figure per score:
# math, reading, writing and average.
#
# `height=` replaces the old `size=` argument, which FacetGrid renamed, and
# `histplot(kde=True, stat="density")` is seaborn's documented replacement for
# the deprecated `distplot` (histogram normalised to a density, KDE overlaid).
for score_column in ("math_score", "reading_score", "writing_score", "average_score"):
    grid = sns.FacetGrid(df, hue="test_completed", height=3)
    grid.map(sns.histplot, score_column, kde=True, stat="density")
    grid.add_legend()
    plt.show()

In [None]:
# Share of students who passed / failed the writing exam.
status_counts = df["WritingPassedStatus"].value_counts()

fig, ax = plt.subplots(figsize=(5, 5))
ax.pie(
    status_counts,
    # Labels come from the counted categories ('Passed!' -> 'P', 'Failed!' -> 'F'),
    # so a wedge can never be mislabelled if the count order changes.
    labels=[status[0] for status in status_counts.index],
    # One offset per wedge, so the call also works if only one status is present.
    explode=[0.05] * len(status_counts),
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.tight_layout()
plt.legend()
plt.show()

# Density Plot

In [None]:
# Bivariate density of reading score against writing score
plt.figure(figsize=(16,14))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so two positional Series are no longer read as x and y.
sns.kdeplot(x=df.reading_score, y=df.writing_score)

# JOINTPLOT

In [None]:
# Joint distribution of reading and writing scores with marginal plots.
# jointplot builds its own figure, so no plt.figure() call precedes it; an
# explicit one only adds an empty figure to the output. Use jointplot's
# `height=` argument to change the size.
sns.jointplot(x='reading_score',y='writing_score',data=df)

# Train model(Linear regression)

In [None]:
# Show the first row of the frame
df.head(1)

In [None]:
# Encode the categorical columns as integers, THEN select features and target.
#
# Order matters: df[[...]] returns a copy, so selecting X before encoding
# leaves string categories in X and the models cannot be fitted on a fresh
# "Restart & Run All". The codes are written back with plain column
# assignment rather than df[col].replace(..., inplace=True), which is a
# chained in-place write that does not update df under pandas copy-on-write.
#
# NOTE(review): 'some high school' and 'high school' share code 1, and
# 'some college' (3) ranks above "associate's degree" (2). The values are kept
# exactly as they were; confirm this ordinal scale is intended.
CATEGORY_CODES = {
    'gender': {'male': 0, 'female': 1},
    'race': {'group A': 1, 'group B': 2, 'group C': 3, 'group D': 4, 'group E': 5},
    'lunch_type': {'free/reduced': 0, 'standard': 1},
    'test_completed': {'none': 0, 'completed': 1},
    'parent_education': {'some high school': 1, 'high school': 1, "associate's degree": 2,
                         'some college': 3, "bachelor's degree": 4, "master's degree": 5},
}

for column, codes in CATEGORY_CODES.items():
    # codes.get(value, value) leaves already-encoded values untouched, so
    # re-running the cell is harmless; astype(int) fails loudly on any
    # category that has no code instead of passing a string to the model.
    df[column] = df[column].map(lambda value, codes=codes: codes.get(value, value)).astype(int)

# Features: the encoded categorical columns plus the math and reading scores
X = df[['gender','race','parent_education','lunch_type','test_completed','math_score','reading_score']]

# Target: the writing score
y = df['writing_score']

df.head(4)

In [None]:

# Split the data into train (70%) and test (30%) sets.
# random_state pins the shuffle so the split, and every metric computed on it,
# is the same on each run of the notebook.
from sklearn import model_selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Linear Regression with scikit-learn.
# Step 1: 10-fold cross-validation on the full data set for a stability estimate.
from sklearn.linear_model import LinearRegression
kfold = model_selection.KFold(n_splits=10)
lr = LinearRegression()
scoring = 'r2'
results = model_selection.cross_val_score(lr, X, y, cv=kfold, scoring=scoring)
# Report the cross-validation result; it was computed before but never shown.
print('10-fold CV R^2: mean = %.4f, std = %.4f' % (results.mean(), results.std()))

# Step 2: fit on the training split and predict the held-out test split.
lr.fit(X_train,y_train)
lr_predictions = lr.predict(X_test)
print('Coefficients: \n', lr.coef_)

In [None]:
from sklearn import metrics

# Error metrics of the linear-regression predictions on the test split
lr_mae = metrics.mean_absolute_error(y_test, lr_predictions)
lr_mse = metrics.mean_squared_error(y_test, lr_predictions)

print('MAE:', lr_mae)
print('MSE:', lr_mse)
print('RMSE:', np.sqrt(lr_mse))

In [None]:
from sklearn.metrics import r2_score

# R^2 of the linear-regression predictions on the test split
lr_r2 = r2_score(y_test, lr_predictions)
print("R_square score: ", lr_r2)

# Decision Trees

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a seeded decision tree on the training split (fit() returns the estimator)
dtr = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
dtr_predictions = dtr.predict(X_test)

# R^2 on the test split
dtr_r2 = r2_score(y_test, dtr_predictions)
print("R_square score: ", dtr_r2)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# random_state pins the forest's randomness so the score is reproducible; the
# seed matches the one used for the decision tree.
rfr = RandomForestRegressor(n_estimators = 100, random_state = 42)
rfr.fit(X_train,y_train)
rfr_predicitions = rfr.predict(X_test) 

# R^2 Score on the test split
print("R_square score: ", r2_score(y_test,rfr_predicitions))

# Gradient Boosting

In [None]:
from sklearn import ensemble
# The `loss` argument is omitted on purpose: the old spelling 'ls' was removed
# from scikit-learn, and the default loss is the same squared error under its
# current name, so relying on the default works across versions.
# random_state makes the fit reproducible.
clf = ensemble.GradientBoostingRegressor(n_estimators = 400, max_depth = 5, min_samples_split = 2,
          learning_rate = 0.1, random_state = 42)
clf.fit(X_train, y_train)
clf_predicitions = clf.predict(X_test) 
print("R_square score: ", r2_score(y_test,clf_predicitions))

# Comparison of all the models discussed above

In [None]:
# Test-set R^2 of every model, keyed by model name so each bar is labelled
# with the score it shows (the label list used to name RandomForest and
# DecisionTree in the opposite order to the scores).
# Dedicated names are used instead of `x` / `y` so the regression target `y`
# is not overwritten by the score array.
model_scores = {
    "LinearRegression": r2_score(y_test, lr_predictions),
    "DecisionTree": r2_score(y_test, dtr_predictions),
    "RandomForest": r2_score(y_test, rfr_predicitions),
    "Gradient Boost": r2_score(y_test, clf_predicitions),
}
model_names = list(model_scores)
scores = np.array(list(model_scores.values()))

plt.bar(model_names, scores)
plt.scatter(model_names, scores)
plt.title("Comparison of Regression Algorithms")
plt.ylabel("r2_score")
plt.show()

In [None]:
# Test-set R^2 of each model, printed in the order:
# linear regression, decision tree, random forest, gradient boosting
all_predictions = (lr_predictions, dtr_predictions, rfr_predicitions, clf_predicitions)
for model_predictions in all_predictions:
    print(r2_score(y_test, model_predictions))


# Best model here by R² score: Linear Regression > Random Forest > Gradient Boosting > Decision Tree

# Thank You!