In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the student-performance CSV and preview its first five rows.
df = pd.read_csv(
    '/kaggle/input/students-performance-in-exams/StudentsPerformance.csv'
)
df.head(5)

In [None]:
# Summary statistics, transposed so each described column becomes a row.
df.describe().transpose()

In [None]:
# Print the frame's column dtypes and non-null counts.
df.info()

In [None]:
# Number of students in each race/ethnicity group.
sns.countplot(data=df, x='race/ethnicity')

In [None]:
# Bar chart of how many rows fall under each gender label.
df['gender'].value_counts().plot.bar()

In [None]:
# Count each gender label directly. Deriving the female count as
# "total rows minus male rows" would silently attribute any missing or
# unexpected gender value to the female group.
gender_counts = df['gender'].value_counts()
female_count = int(gender_counts.get('female', 0))
male_count = int(gender_counts.get('male', 0))
print('Female Count='+str(female_count)+'\nMale Count='+str(male_count))

In [None]:
# Frequency of each parental education level: printed, then drawn as bars.
education_counts = df['parental level of education'].value_counts()
print(education_counts)
education_counts.plot.bar()
plt.xlabel('Parental Level of Education')
plt.ylabel('Count')
plt.show()

In [None]:
# Frequency of each lunch type: printed, then drawn as bars.
lunch_counts = df['lunch'].value_counts()
print(lunch_counts)
lunch_counts.plot.bar()
plt.xlabel('Lunch Type')
plt.ylabel('Count')
plt.show()

In [None]:
# Frequency of each test-preparation status: printed, then drawn as bars.
prep_counts = df['test preparation course'].value_counts()
print(prep_counts)
prep_counts.plot.bar()
plt.xlabel('Test Preparation Course')
plt.ylabel('Count')
plt.show()

In [None]:
# Distribution of math scores: 10-bin histogram with a KDE overlay.
sns.histplot(
    x='math score',
    data=df,
    kde=True,
    bins=10,
)

In [None]:
# Math-score distribution split by test-preparation status.
sns.histplot(
    x='math score',
    hue='test preparation course',
    data=df,
    kde=True,
    bins=10,
)

In [None]:
# Math vs. reading scores, coloured by gender, drawn on an explicit Axes.
ax = plt.gca()
ax.set_title('Math vs Reading Score', size=16)
sns.scatterplot(data=df, x='math score', y='reading score', hue='gender', ax=ax)
ax.set_xlabel('Math score', size=12)
ax.set_ylabel('Reading score', size=12)

In [None]:
# Math vs. writing scores, coloured by gender, drawn on an explicit Axes.
ax = plt.gca()
ax.set_title('Math vs Writing Score', size=16)
sns.scatterplot(data=df, x='math score', y='writing score', hue='gender', ax=ax)
ax.set_xlabel('Math score', size=12)
ax.set_ylabel('Writing score', size=12)

In [None]:
# Reading-score distribution split by test-preparation status, with KDE.
sns.histplot(
    x='reading score',
    hue='test preparation course',
    data=df,
    kde=True,
    bins=10,
)

In [None]:
# Reading vs. writing scores, coloured by gender, drawn on an explicit Axes.
ax = plt.gca()
ax.set_title('Reading vs Writing Score', size=16)
sns.scatterplot(data=df, x='reading score', y='writing score', hue='gender', ax=ax)
ax.set_xlabel('Reading score', size=12)
ax.set_ylabel('Writing score', size=12)

In [None]:
# Distribution of writing scores: 10-bin histogram with a KDE overlay.
sns.histplot(
    x='writing score',
    data=df,
    kde=True,
    bins=10,
)

In [None]:
# Writing-score distribution split by test-preparation status.
sns.histplot(
    x='writing score',
    hue='test preparation course',
    data=df,
    kde=True,
    bins=10,
)

In [None]:
# Distribution of reading scores: 10-bin histogram with a KDE overlay.
sns.histplot(
    x='reading score',
    data=df,
    kde=True,
    bins=10,
)

In [None]:
# Reading-score histogram split by test-preparation status (no KDE overlay).
sns.histplot(
    x='reading score',
    hue='test preparation course',
    data=df,
    bins=10,
)

In [None]:
# Per-student average of the three subject scores, stored as a new column.
df['avg_marks'] = (
    df['math score']
    .add(df['writing score'])
    .add(df['reading score'])
    .div(3)
)
# Subset holding the three subject scores plus their average, for the density plot.
kde_data = df.loc[:, ['math score', 'writing score', 'reading score', 'avg_marks']]

In [None]:
# Overlaid density estimates for every column of kde_data.
# `fill=True` replaces the deprecated `shade=True` keyword, which newer
# seaborn releases warn about and are scheduled to remove.
sns.kdeplot(data=kde_data, fill=True, palette='colorblind')

In [None]:
# Show the frame's column labels (includes the derived 'avg_marks' column).
df.columns

In [None]:
# Box plots of average marks per race/ethnicity group, split by
# test-preparation status; outlier markers are hidden.
sns.catplot(
    data=df,
    x='race/ethnicity',
    y='avg_marks',
    hue='test preparation course',
    kind='box',
    showfliers=False,
)

In [None]:
# Box plots of average marks per parental education level, split by
# test-preparation status; outlier markers are hidden.
order = ["master's degree","bachelor's degree",
         "associate's degree","some college",
         "high school","some high school"]
# catplot is a figure-level function that builds its own figure, so a
# preceding plt.figure(figsize=...) only produces a stray empty figure.
# The 8x4 size is requested through height/aspect instead.
grid = sns.catplot(x='parental level of education',y='avg_marks',hue='test preparation course',
                   data=df,order=order,kind='box',showfliers=False,
                   height=4,aspect=2)
# Shrink the x tick labels on this plot only, rather than mutating the
# global plt.rcParams (which would leak into every later figure).
for ax in grid.axes.flat:
    ax.tick_params(axis='x', labelsize=5)
plt.show()

In [None]:
# Box plots of average marks per parental education level, split by
# lunch type; outlier markers are hidden.
order = ["master's degree","bachelor's degree",
         "associate's degree","some college",
         "high school","some high school"]
# catplot is a figure-level function that builds its own figure, so a
# preceding plt.figure(figsize=...) only produces a stray empty figure.
# The 8x4 size is requested through height/aspect instead.
grid = sns.catplot(x='parental level of education',y='avg_marks',
                   data=df,order=order,kind='box',hue='lunch',
                   showfliers=False,height=4,aspect=2)
# Shrink the x tick labels on this plot only, rather than mutating the
# global plt.rcParams (which would leak into every later figure).
for ax in grid.axes.flat:
    ax.tick_params(axis='x', labelsize=5)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge,LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Target: the average mark. Features: every remaining column except the
# individual scores and the target, one-hot encoded with the first level
# of each category dropped.
y = df['avg_marks']
X = pd.get_dummies(
    df.drop(columns=['math score', 'reading score', 'writing score', 'avg_marks']),
    drop_first=True,
)

In [None]:
# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Report the shapes of each (features, target) pair.
for features, target in ((X_train, y_train), (X_val, y_val)):
    print(features.shape, target.shape)

In [None]:
# Fit a Ridge regression with default hyperparameters and score it on the
# validation split.
model1=Ridge()
model1.fit(X_train,y_train)
pred=model1.predict(X_val)
# RMSE is taken as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
score = np.sqrt(mean_squared_error(y_val,pred))
score

In [None]:
# Fit an ordinary least-squares regression and score it on the validation
# split.
model2=LinearRegression()
model2.fit(X_train,y_train)
pred=model2.predict(X_val)
# RMSE is taken as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
score = np.sqrt(mean_squared_error(y_val,pred))
score

In [None]:
# Coefficients learned by the Ridge model fitted on X_train.
model1.coef_

In [None]:
# Coefficients learned by the LinearRegression model fitted on X_train.
model2.coef_

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the training split and plot its feature importances.
# A fixed random_state makes the forest, and therefore the importance
# ranking, reproducible across runs.
model = RandomForestRegressor(random_state=42)
model.fit(X_train,y_train)
feature_importance = np.array(model.feature_importances_)
feature_names = np.array(X_train.columns)
data={'feature_names':feature_names,'feature_importance':feature_importance}
# Sort into a new frame (most important first) instead of mutating in place.
df_plt = pd.DataFrame(data).sort_values(by='feature_importance', ascending=False)
# The style has to be selected before the figure is created; applying it
# after drawing leaves the current plot unstyled.
plt.style.use("ggplot")
plt.figure(figsize=(8,6))
sns.barplot(x=df_plt['feature_importance'], y=df_plt['feature_names'])
plt.xlabel('FEATURE IMPORTANCE')
plt.ylabel('FEATURE NAMES')
plt.show()