In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Apply seaborn's default theme to all subsequent matplotlib figures.
sns.set_theme()

In [None]:
# Load the student exam results into a DataFrame.
students = pd.read_csv(
    filepath_or_buffer='../input/students-performance-in-exams/StudentsPerformance.csv'
)

In [None]:
# Per-student mean of the three exam scores, rounded to two decimals.
score_total = students['math score'] + students['reading score'] + students['writing score']
students['average score'] = (score_total / 3).round(2)

In [None]:
# Preview the first rows, including the derived 'average score' column.
students.head()

In [None]:
# Count the missing values in each column.
students.isna().sum(axis=0)

In [None]:
# Print the per-column summary of the frame.
students.info()

# Exploratory Analysis

In [None]:
# Correlation heatmap of the score columns.
# Only numeric columns are passed to corr(): the frame also holds categorical
# (object) columns such as 'gender', and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently dropping them.
plt.figure(figsize=(12,8))
numeric_scores = students.select_dtypes(include='number')
sns.heatmap(numeric_scores.corr(),annot=True,cmap='Blues',vmin=0,vmax=1)

### High Correlation between all the scores

In [None]:
# Pairwise scatter plots of the numeric columns, coloured by gender.
sns.pairplot(data=students, hue='gender')

### From the pair plot above, there is no large overall difference between male and female students, but females score slightly higher in reading and writing, and males slightly higher in math.

In [None]:
# Histogram of the average score with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(12,8))
sns.histplot(data=students, x='average score', kde=True, ax=ax)

In [None]:
# One box plot per score column, laid out side by side in a 1x4 grid.
plt.figure(figsize=(16,8))
score_columns = ['math score', 'reading score', 'writing score', 'average score']
for panel, column in enumerate(score_columns, start=1):
    plt.subplot(1,4,panel)
    sns.boxplot(y=column, data=students)
plt.tight_layout()

### Each of the four score distributions contains some outliers

In [None]:
# Distribution of categorical data: one count plot per object-dtype column.
for i in students.columns:
    if students[i].dtype=='object':
        # Create the figure only for columns that are actually plotted, so the
        # numeric columns no longer leave empty figures behind.
        fig=plt.figure(figsize=(10,4))
        sns.countplot(x=i,data=students)
        # plt.show() renders the current figure on both inline and GUI backends.
        plt.show()

# Data Preprocessing

In [None]:
# Working copy for modelling, without the derived 'average score' column.
students_c = students.drop(columns='average score')

In [None]:
#Removing Outlier
def outlier_r(data, factor=1.5):
    """Flag the values of ``data`` that fall inside the IQR fences.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to test.
    factor : float, default 1.5
        Multiple of the inter-quartile range added above the third quartile
        and subtracted below the first quartile to form the fences.

    Returns
    -------
    pandas.Series of bool
        True where the value lies within ``[q1 - factor*IQR, q3 + factor*IQR]``
        (both ends inclusive), False where it is an outlier.
    """
    q1=data.quantile(.25)
    q3=data.quantile(.75)
    iqr=q3-q1
    lower=q1 - factor * iqr
    upper=q3 + factor * iqr
    return data.between(lower,upper)

In [None]:
# Filter on each score column in turn; every pass computes its bounds on the
# rows that survived the previous pass.
for score_col in ('math score', 'reading score', 'writing score'):
    students_c = students_c[outlier_r(students_c[score_col])]

In [None]:
# (rows, columns) of the working frame at this point.
students_c.shape

In [None]:
# Targets: every column from 'math score' through 'writing score'.
target = students_c.loc[:, 'math score':'writing score']
# Features: the working frame without the three score columns.
students_c = students_c.drop(columns=['math score', 'reading score', 'writing score'])

In [None]:
# Preview the first rows of the regression targets.
target.head()

In [None]:
# Preview the first rows of the working frame.
students_c.head()

In [None]:
# One-hot encode the frame into `a` when it has at least one object column.
# pd.get_dummies(students_c) does not depend on which column triggered it, so
# it is computed once instead of once per object column.
if any(students_c[col].dtype=='object' for col in students_c):
    a=pd.get_dummies(students_c)

In [None]:
# Replace the working frame with its one-hot encoded form.
students_c = pd.get_dummies(data=students_c)

In [None]:
# Preview the first rows of the working frame.
students_c.head()

# Model Creation and Prediction

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Hold out 25% of the rows for evaluation. random_state pins the shuffle so the
# split, and every metric computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    students_c, target, test_size=0.25, random_state=42
)

In [None]:
# Fit a linear regression on the training split, then predict the held-out rows.
LR = LinearRegression().fit(X_train, y_train)
predict_lr = LR.predict(X_test)

In [None]:
# Root-mean-squared error of the predictions on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=predict_lr)
np.sqrt(mse)

In [None]:
# Frame holding the actual test-set targets.
predict = pd.DataFrame(data=y_test)

In [None]:
# Show actual and predicted scores side by side.
# The predictions are given the same row labels as `predict` before the
# index-on-index merge. With a default 0..n-1 index they would only match the
# rows whose original label happens to fall in that range, and would pair those
# rows with the wrong predictions.
predicted = pd.DataFrame(
    predict_lr,
    index=predict.index,
    columns=[str(col) + ' (predicted)' for col in predict.columns],
)
predict.merge(predicted, left_index=True, right_index=True)