In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Variable Description
   * gender: gender of the student
   * parental level of education: education level of the student's parents
   * lunch: price level of students' lunches.
   * test preparation course: whether the student attended an exam preparation course
   * math score, reading score, writing score: test scores of each student
   * race/ethnicity: student's race / ethnicity class

## **Load and Check Data**

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import networkx as nx

In [None]:
# Read the students-performance CSV into a DataFrame and preview the first rows.
csv_path = '../input/students-performance-in-exams/StudentsPerformance.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

In [None]:
# List the column labels of the loaded frame to see which variables are available.

data.columns

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Data type of each column.
data.dtypes

### **Descriptive Statistics**

In [None]:
# Summary statistics produced by describe() for the frame's columns.
data.describe()

In [None]:
# Count the distinct values in each column.
data.nunique()

### Percentage Of Gender

In [None]:
# Share of each gender in the dataset as a pie chart.
# A single-axes figure lets the pie fill the canvas instead of occupying one
# slot of a 1x4 grid on a 30x10 figure.
fig, ax = plt.subplots(figsize=(8, 8))
data['gender'].value_counts().plot.pie(autopct="%1.1f%%", ax=ax)
ax.set_title('Gender', fontsize=20)
ax.set_ylabel('')  # the title already names the variable
plt.show()

### Percentage Of Ethnicity

In [None]:
# Share of each race/ethnicity group in the dataset as a pie chart.
# A single-axes figure lets the pie fill the canvas instead of sitting in the
# second slot of a 1x4 grid on a 30x10 figure.
fig, ax = plt.subplots(figsize=(8, 8))
data['race/ethnicity'].value_counts().plot.pie(autopct="%1.1f%%", ax=ax)
ax.set_title('Ethnicity', fontsize=20)
ax.set_ylabel('')  # the title already names the variable
plt.show()

### Percentage Of Lunch

In [None]:
# Share of each lunch category in the dataset as a pie chart.
# A single-axes figure lets the pie fill the canvas instead of occupying one
# slot of a 1x4 grid on a 30x10 figure.
fig, ax = plt.subplots(figsize=(8, 8))
data['lunch'].value_counts().plot.pie(autopct="%1.1f%%", ax=ax)
ax.set_title('Lunch', fontsize=20)
ax.set_ylabel('')  # the title already names the variable
plt.show()

### Percentage Of Parental-Education

In [None]:
# Share of each parental education level in the dataset as a pie chart.
# A single-axes figure lets the pie fill the canvas instead of occupying one
# slot of a 1x4 grid on a 30x10 figure.
fig, ax = plt.subplots(figsize=(8, 8))
data['parental level of education'].value_counts().plot.pie(autopct="%1.1f%%", ax=ax)
ax.set_title('Parental-Education', fontsize=20)
ax.set_ylabel('')  # the title already names the variable
plt.show()

# **Analysis**
* Correlation Between Scores

In [None]:
# Pairwise correlation between the numeric columns, shown as an annotated heatmap.
# Restricting to numeric dtypes first keeps this working on pandas >= 2.0, where
# DataFrame.corr() raises on string columns instead of silently dropping them.
correlation = data.select_dtypes(include='number').corr()
sns.heatmap(
    correlation,
    xticklabels=correlation.columns,
    yticklabels=correlation.columns,
    annot=True,
)

* Pair Plot of Scores

In [None]:
# Grid of pairwise plots built from the columns of `data`.
sns.pairplot(data)

**Using different plotting methods to understand the relationships between variables**

In [None]:
# Reading score plotted against math score, with points coloured by gender.
sns.relplot(x='math score',y='reading score',hue='gender',data=data)

In [None]:
# Average the three exam scores per student and round the result.
# skipna=False keeps a missing score propagating to a missing mean.
score_columns = ['math score', 'reading score', 'writing score']
data["mean score"] = data[score_columns].sum(axis=1, skipna=False).div(3).round()
data.head()

* Distribution Of Math Score

In [None]:
# Density-normalised histogram of the math scores with a KDE curve on top.
# histplot is the replacement for the deprecated sns.distplot and is
# configured here to draw the same kind of figure.
sns.histplot(data['math score'], kde=True, stat='density', kde_kws=dict(cut=3))

* Distribution Of Reading Score

In [None]:
# Density-normalised histogram of the reading scores with a KDE curve on top.
# histplot is the replacement for the deprecated sns.distplot and is
# configured here to draw the same kind of figure.
sns.histplot(data['reading score'], kde=True, stat='density', kde_kws=dict(cut=3))

* Distribution Of Writing Score

In [None]:
# Density-normalised histogram of the writing scores with a KDE curve on top.
# histplot is the replacement for the deprecated sns.distplot and is
# configured here to draw the same kind of figure.
sns.histplot(data['writing score'], kde=True, stat='density', kde_kws=dict(cut=3))

### Box plot of each score to detect outliers

In [None]:
# One horizontal box plot per score column, each in its own figure.
# Looping over the column names avoids repeating the same call three times.
for score_column in ('math score', 'reading score', 'writing score'):
    sns.catplot(x=score_column, kind='box', data=data)
plt.show()

# Data preparation for modelling

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every categorical column with integer codes, in place on `data`.
# The one shared encoder is refitted for each column by fit_transform.
lc = LabelEncoder()
categorical_columns = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]
for column in categorical_columns:
    data[column] = lc.fit_transform(data[column])
data.head()

* 0 : Female
* 1 : Male

In [None]:
# Bar chart of 'mean score' for each encoded gender value.
bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=data['gender'], y=data['mean score'], ax=bar_ax)
plt.setp(bar_ax.get_xticklabels(), rotation=90)
bar_ax.set_xlabel('Gender')
bar_ax.set_ylabel('Mean Score')
bar_ax.set_title('Score difference between male and female students')
plt.show()

# Violin plot of the same grouping, showing the distribution of 'mean score'.
violin_fig, violin_ax = plt.subplots(figsize=(10, 6))
sns.violinplot(x=data['gender'], y=data['mean score'], split=True, ax=violin_ax)
sns.despine(left=True)
plt.show()

In [None]:
# Remove the three individual score columns; the derived 'mean score' column stays.
data = data.drop(columns=['math score', 'writing score', 'reading score'])
data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except 'mean score'; 'mean score' is the target.
x = data.drop(columns=['mean score'])
y = data['mean score']

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# **Model Building with Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Logistic regression estimator using the liblinear solver with a fixed random_state.
# NOTE(review): the target `y` is the rounded mean exam score, so each distinct
# score value is treated as its own class label by this classifier. A
# regression estimator may suit a numeric score better — confirm the choice.
model = LogisticRegression(solver='liblinear', random_state=0)

In [None]:
# Fit the model on the training split.
model.fit(x_train, y_train)

In [None]:
# Predict the target for the held-out test features.
predictions = model.predict(x_test)

In [None]:
# Absolute gap between each predicted and actual mean score in the test split.
difference = (predictions - y_test).abs()

In [None]:
# Average of the absolute differences, i.e. the mean absolute error on the test split.
difference.mean()

**Mean absolute error on the test set is about 11.03 marks**

### **Word_Cloud:**

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Iterating a DataFrame yields its column labels, so the text fed to the cloud
# is the space-joined column names of `data`, not the cell values.
# NOTE(review): confirm that a cloud of column names is what was intended.
cloud_text = ' '.join(data.columns)

fig, ax = plt.subplots(figsize=(8, 8))
wordcloud = WordCloud(background_color='white', width=512, height=384).generate(cloud_text)

ax.imshow(wordcloud)
ax.axis('off')
fig.savefig('graph.png')  # write the rendered cloud to graph.png
plt.show()

***Please upvote if you liked it :)***