In [None]:
import pandas as pd
import numpy as np

from scipy.stats import pearsonr

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

In [None]:
# Read the exam-results CSV into a DataFrame and preview its first rows.
csv_path = '/kaggle/input/students-performance-in-exams/StudentsPerformance.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the frame;
# by default pandas reports only the numeric columns of a mixed-dtype frame.
df.describe()

In [None]:
# data preprocessing

# Integer code assigned to each 'race/ethnicity' label.
group_to_int = dict(zip(
    ['group A', 'group B', 'group C', 'group D', 'group E'],
    range(5),
))


def process_group(group):
    """Return the integer code for a 'race/ethnicity' label.

    Raises KeyError if the label is not a key of `group_to_int`.
    """
    code = group_to_int[group]
    return code


# Replace the text labels with their integer codes (column is overwritten).
group_codes = df['race/ethnicity'].apply(process_group)
df['race/ethnicity'] = group_codes

# Ordinal code for each 'parental level of education' label, ordered from the
# least to the most schooling. Every label gets its own code so that the
# integer order matches the education order and no two levels collide
# (previously "some high school" ranked above "high school" and shared a code
# with "some college", and "associate's degree" shared a code with
# "bachelor's degree").
parental_to_int = {
    "some high school" : 0,
    "high school" : 1,
    "some college" : 2,
    "associate's degree" : 3,
    "bachelor's degree" : 4,
    "master's degree" : 5,
}

def parental_education(deg):
    """Return the ordinal code for a 'parental level of education' label.

    Raises KeyError if the label is not a key of `parental_to_int`.
    """
    return parental_to_int[deg]

# Replace the text labels with their ordinal codes (column is overwritten).
df['parental level of education'] = df['parental level of education'].apply(parental_education)

# Integer code assigned to each 'lunch' label.
lunch_int = dict([
    ("free/reduced", 0),
    ("standard", 1),
])


def lunch_prep(class_):
    """Return the integer code for a 'lunch' label.

    Raises KeyError if the label is not a key of `lunch_int`.
    """
    code = lunch_int[class_]
    return code


# Replace the text labels with their integer codes (column is overwritten).
lunch_codes = df['lunch'].apply(lunch_prep)
df['lunch'] = lunch_codes

# Integer code assigned to each 'test preparation course' label.
course_int = dict([
    ("none", 0),
    ("completed", 1),
])


def course(class_):
    """Return the integer code for a 'test preparation course' label.

    Raises KeyError if the label is not a key of `course_int`.
    """
    code = course_int[class_]
    return code


# Replace the text labels with their integer codes (column is overwritten).
course_codes = df['test preparation course'].apply(course)
df['test preparation course'] = course_codes

# Integer code assigned to each 'gender' label.
gender = dict([
    ('female', 0),
    ('male', 1),
])


def get_gen(gen):
    """Return the integer code for a 'gender' label.

    Raises KeyError if the label is not a key of `gender`.
    """
    code = gender[gen]
    return code


# Replace the text labels with their integer codes (column is overwritten).
gender_codes = df['gender'].apply(get_gen)
df['gender'] = gender_codes

In [None]:
# The three exam-score columns; each is correlated against every column of df
# (including the score columns themselves).
targets = ['math score', 'reading score', 'writing score']

for target in targets:
    print("Correlation with {}:".format(target))
    target_values = df[target]
    for col in df.columns:
        # Off-diagonal entry of the 2x2 correlation matrix is the
        # correlation coefficient between the two series.
        corr_matrix = np.corrcoef(df[col], target_values)
        print(col, ":", corr_matrix[0][1])
    print()

In [None]:
# Pairwise regression plots of all columns, coloured by gender.
# sns.pairplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) call only emits an extra empty figure and
# does not affect the plot size; use pairplot's `height`/`aspect` arguments to
# resize the panels instead. The trailing `;` hides the PairGrid repr.
sns.pairplot(
    df,
    hue='gender',
    kind="reg",
    plot_kws={'line_kws': {'color': 'red'}, 'scatter_kws': {'alpha': 0.3}},
);

In [None]:
# making a regression model to predict the performance (math score) of a student.

from sklearn.svm import SVR
model = SVR()

# training set: the first 700 rows of df.
df_ = df[:700]
# Features are every column except the three exam scores. `columns=` is used
# because passing the axis positionally (`drop(labels, 1)`) is rejected by
# pandas 2.x, where `axis` is keyword-only.
X = df_.drop(columns=['math score', 'reading score', 'writing score'])
y = df_['math score']

model.fit(X, y)

In [None]:
# dataset for testing.
df_test = df[:-300]
X_test = df_test.drop(['math score', 'reading score', 'writing score'],1)
pred = model.predict(X_test)

In [None]:
# evaluating the model.
from sklearn.metrics import mean_squared_error
score = model.score(X,y)
print("R-squared error:", score)
print("MSE:", mean_squared_error(y, pred))

In [None]:
# That is all for this dataset. Please leave a comment on this notebook with suggestions for improving the prediction model.