In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_root = '/kaggle/input'
for folder, _subfolders, files in os.walk(input_root):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot
# Set up plotly's offline notebook mode before any iplot() call in the cells below.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly documentation.
init_notebook_mode(connected=True)

### Declaring a function to check metadata of the dataset.

In [None]:
def master_dataframe(dataframe):
    """Build a per-column metadata summary of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``dataframe`` holding its dtype, null count,
        null percentage (rounded to 2 decimals) and number of unique values,
        joined with the transposed output of ``describe(include='all')``.
        For a frame without any columns, an empty frame carrying only the
        four metadata columns is returned.
    """
    # describe() raises ValueError on a frame that has no columns, so return
    # an empty summary instead of failing.
    if dataframe.shape[1] == 0:
        return pd.DataFrame(columns=['Datatype', 'Null Values', 'Null %',
                                     'No: Of Unique Values'])

    # Count nulls once and reuse the result for both the count and the percentage.
    null_counts = dataframe.isna().sum()

    df_metadata = pd.DataFrame({'Datatype': dataframe.dtypes,
                                "Null Values": null_counts,
                                "Null %": round(null_counts / len(dataframe) * 100, 2),
                                "No: Of Unique Values": dataframe.nunique()})

    # describe() reports one column per input column; transpose it so its
    # index (the column names) lines up with df_metadata for the join.
    df_describe = dataframe.describe(include='all').T

    return df_metadata.join(df_describe)

In [None]:
# Read the StudentsPerformance CSV from the (relative) Kaggle input directory.
DATA_PATH = '../input/students-performance-in-exams/StudentsPerformance.csv'
raw_data = pd.read_csv(DATA_PATH)

In [None]:
# Show the per-column summary (dtype, null count, null %, unique count and
# describe() statistics) for the raw data.
master_dataframe(raw_data)

Good to see there are no Null values to handle.

### Let's check the distinct values of each column

In [None]:
# Show the distinct values of every categorical column in a single cell
# instead of repeating the same call in one cell per column.
categorical_columns = ['gender', 'race/ethnicity', 'parental level of education',
                       'lunch', 'test preparation course']
{column: raw_data[column].unique() for column in categorical_columns}

### All of these columns are categorical and have valid values, i.e. no extra spaces, garbage data, etc.
### Our aim is to predict the total marks scored by a student. So, let's introduce our Total column and drop the individual score columns.

In [None]:
score_columns = ["math score", "reading score", "writing score"]

# Total is the plain sum of the three subject scores; once it has been added,
# the individual score columns are dropped. assign() returns a new frame, so
# raw_data itself is left untouched.
data_total_marks = (
    raw_data
    .assign(Total=raw_data["math score"] + raw_data["reading score"] + raw_data["writing score"])
    .drop(columns=score_columns)
)
data_total_marks.head()

### Now let's explore the data.

In [None]:
# Sum the totals per race/ethnicity and gender before plotting. Passing the
# un-aggregated frame makes plotly draw one bar mark per student and pile them
# up within each group; aggregating first yields a single bar per group with
# the same height and makes it explicit that the bars are sums (so they also
# reflect how many students each group contains).
totals_by_race_gender = (
    data_total_marks
    .groupby(['race/ethnicity', 'gender'], as_index=False)['Total']
    .sum()
)

fig = px.bar(data_frame=totals_by_race_gender,
             x='race/ethnicity',
             y='Total',
             color='gender',
             barmode='group',
             title='Sum of total scores by race/ethnicity and gender')

iplot(fig)

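We can see that students from group C have the highest combined totals, and in almost every race/ethnicity group females have scored more than males; the exception is group A, where males have scored more. Note that each bar is the sum of `Total` over all students in that group, so groups with more students produce taller bars regardless of how well individual students performed.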

We can say that a female student has a higher chance of scoring well than a male student.

In [None]:
# Sum of Total per (race/ethnicity, parental education) pair, ordered from the
# highest to the lowest sum.
grp_par_rac = (
    data_total_marks
    .groupby(by=['race/ethnicity', 'parental level of education'])['Total']
    .sum()
    .reset_index()
    .sort_values(by=['Total'], ascending=False)
)

bar_options = dict(x='race/ethnicity',
                   y='Total',
                   color='parental level of education',
                   barmode='group')
fig = px.bar(data_frame=grp_par_rac, **bar_options)

iplot(fig)

In [None]:
# Sum the totals per lunch type and race/ethnicity before plotting, so that
# each bar is a single mark rather than a pile of one mark per student. The
# bar heights are unchanged: they are sums and therefore also depend on how
# many students fall into each group.
totals_by_lunch_race = (
    data_total_marks
    .groupby(['lunch', 'race/ethnicity'], as_index=False)['Total']
    .sum()
)

fig = px.bar(data_frame=totals_by_lunch_race,
             x='lunch',
             y='Total',
             color='race/ethnicity',
             barmode='group',
             title='Sum of total scores by lunch type and race/ethnicity')

iplot(fig)

### Let's check how the combined totals split, in percent, between students whose parents' education level is "high school" and those whose parents' level is "some high school".

In [None]:
# Keep only the students whose parental education is one of the two high-school levels.
high_school_levels = ['high school', 'some high school']
is_high_school = data_total_marks['parental level of education'].isin(high_school_levels)
df_high_school = data_total_marks[is_high_school]

# Sum of Total per level, plus each level's share of the combined sum in percent.
grp_high_school = df_high_school.groupby(by='parental level of education')['Total'].sum().reset_index()
share_of_total = grp_high_school['Total'] / grp_high_school['Total'].sum() * 100
grp_high_school['Percentage_Scored'] = share_of_total.round(2)
grp_high_school

### Let's create some dummy variables.

In [None]:
# Work on a copy so that the encoding steps in the following cells do not
# modify data_total_marks.
data_with_dummies = data_total_marks.copy()

In [None]:
# Using Label Encoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# The same encoder instance is re-fitted for each column in turn, so after the
# loop `le` only holds the classes of the last column processed.
for column in ('gender', 'lunch', 'test preparation course'):
    data_with_dummies[column] = le.fit_transform(data_with_dummies[column])
data_with_dummies.head()

In [None]:
# List the column names before the remaining categorical columns are one-hot encoded.
data_with_dummies.columns

In [None]:
# Using get_dummies
# dtype=int pins the dummy columns to 0/1 integers. Recent pandas versions
# create bool dummies by default; mixed with the integer label-encoded columns
# these turn `.values` into an object array, which the VIF computation further
# down is not guaranteed to accept. drop_first=True drops the first level of
# each encoded column.
data_with_dummies = pd.get_dummies(data=data_with_dummies,
                                   columns=["race/ethnicity", "parental level of education"],
                                   drop_first=True,
                                   dtype=int)
data_with_dummies.columns

In [None]:
# Explicit list of the model's feature columns, in the order they should appear.
feature_columns = [
    'gender',
    'lunch',
    'test preparation course',
    'race/ethnicity_group B',
    'race/ethnicity_group C',
    'race/ethnicity_group D',
    'race/ethnicity_group E',
    "parental level of education_bachelor's degree",
    'parental level of education_high school',
    "parental level of education_master's degree",
    'parental level of education_some college',
    'parental level of education_some high school',
]

# Features first, target ('Total') as the last column.
data_rearranged = data_with_dummies[feature_columns + ['Total']]

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every feature column (the target is excluded).
variables = data_rearranged.drop(columns=['Total'])

# Cast to float explicitly: if the frame mixes integer and bool columns,
# `.values` is an object array, which is not a valid numeric design matrix.
exog = variables.values.astype(float)

# NOTE(review): no constant column is added to the design matrix, so the VIFs
# are computed from the columns exactly as given — confirm whether an
# intercept (statsmodels add_constant) is wanted here.
vif = pd.DataFrame({
    'vif': [variance_inflation_factor(exog, i) for i in range(exog.shape[1])],
    'features': variables.columns,
})
vif

### Split Train & Test Data

In [None]:
# List the columns of the modelling frame before separating features and target.
data_rearranged.columns

In [None]:
# 'Total' is the regression target; every other column is a feature.
target_column = 'Total'
y = data_rearranged[target_column]
X = data_rearranged.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each part of the split.
for label, split in (('X_train: ', X_train), ('X_test: ', X_test),
                     ('y_train: ', y_train), ('y_test: ', y_test)):
    print(label, split.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary linear regression on the training split; fit() returns the
# estimator itself, which is then shown as the cell output.
regressor = LinearRegression().fit(X_train, y_train)
regressor

In [None]:
# Predict the total score for every row of the test split.
y_pred = regressor.predict(X_test)
# Preview only the first few predictions rather than dumping the whole array.
y_pred[:10]

In [None]:
# Predicted (x) against actual (y) totals for the test split. Both axes are
# labelled explicitly: y_pred is a plain array and would otherwise leave the
# x axis without a label.
ax = sns.scatterplot(x=y_pred, y=y_test)
ax.set(xlabel='Predicted total score',
       ylabel='Actual total score',
       title='Predicted vs. actual total score (test set)')

plt.show()