## importing the tools

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import missingno as misno

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the students-performance CSV from the Kaggle input directory and preview the first rows.
csv_path = "/kaggle/input/students-performance-in-exams/StudentsPerformance.csv"
df = pd.read_csv(csv_path)
df.head()

## EDA

In [None]:
# Draw missingno's nullity matrix for df to check visually for missing values.
misno.matrix(df);

* No NaN values spotted.

In [None]:
# Show the dtype of every column.
df.dtypes

Most of the features have the object (string) data type, so they will have to be encoded before they can be fed to a model.

In [None]:
# Bar chart of the number of distinct values in each column, followed by the exact counts.
unique_counts = df.nunique()

plt.figure(figsize=(10,6))
plt.xticks(rotation=90)
plt.bar(df.columns, unique_counts);
unique_counts

Check the number of unique values per column to see whether any feature is constant and can be removed.

In [None]:
# Number of students per gender.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a bare Series passed positionally is no
# longer interpreted as the x variable.
plt.figure(figsize = (12,10))
sns.countplot(data=df, x="gender");

Most of the samples are female, which may bias the data slightly towards the female gender.

In [None]:
# Pairwise relationship plots for the columns of df.
sns.pairplot(data=df)
plt.show()

We see numerical columns have a positive correlation with each other.

In [None]:
# Correlation heatmap of the numeric columns only. The categorical (object)
# columns cannot be correlated, and DataFrame.corr() raises on them with
# pandas >= 2.0, so they are filtered out explicitly.
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True);

Given this correlation, and the strong relationship between the writing score and the reading score in particular, we may create a new feature by dividing one by the other or by merging them.

In [None]:
# Distribution of every numeric column: a density-scaled histogram (black bars)
# with a kernel density estimate (blue line) drawn on top.
# sns.distplot is deprecated, so the figure is built from its replacements,
# histplot and kdeplot.
for col in df.select_dtypes(include="number").columns:
    sns.histplot(df[col], stat="density", color="k", edgecolor="k", linewidth=1)
    sns.kdeplot(df[col], color="b")
    plt.show()


## Study the data

In [None]:
# Mean of each numeric column per parental education level.
# numeric_only=True keeps the categorical columns out of the aggregation;
# without it pandas >= 2.0 raises a TypeError on the object columns.
df.groupby(["parental level of education"]).mean(numeric_only=True).plot.bar()
plt.show()

Looking at the bar plot, students whose parents hold a master's degree have the highest average math, reading and writing scores, which is reasonable.

In [None]:
# Parental education level counts among female students.
df_fem = df[df["gender"]=="female"]

# Labels and bar heights both come from the same value_counts() result.
# unique() returns values in order of first appearance while value_counts()
# is sorted by frequency, so pairing the two attaches bars to the wrong labels.
fem_edu_counts = df_fem["parental level of education"].value_counts()

plt.xticks(rotation=90)
plt.title("female education")
plt.bar(fem_edu_counts.index, fem_edu_counts.values, color=["purple"]);

For most female students, the parental level of education is an associate's degree or some college.

In [None]:
# Parental education level counts among male students.
df_male = df[df["gender"]=="male"]

# Labels and bar heights both come from the same value_counts() result.
# unique() returns values in order of first appearance while value_counts()
# is sorted by frequency, so pairing the two attaches bars to the wrong labels.
male_edu_counts = df_male["parental level of education"].value_counts()

plt.xticks(rotation=90)
plt.title("male education")
plt.bar(male_edu_counts.index, male_edu_counts.values);

For most male students, the parental level of education is an associate's degree, some college or a high school degree.

In [None]:
# Mean of each numeric column per lunch type.
# numeric_only=True keeps the categorical columns out of the aggregation;
# without it pandas >= 2.0 raises a TypeError on the object columns.
df.groupby(["lunch"]).mean(numeric_only=True).plot.bar()
plt.show()

Most students have a standard lunch. Does the type of lunch affect the scores?

In [None]:
# Lunch type counts among female students.
# Labels and heights are taken from the same value_counts() result so each bar
# is guaranteed to carry its own label (unique() is in appearance order, not
# frequency order).
fem_lunch_counts = df_fem["lunch"].value_counts()
plt.bar(fem_lunch_counts.index, fem_lunch_counts.values, color=["purple"]);
print(fem_lunch_counts)

63.5% of female students have a standard lunch.

In [None]:
# Lunch type counts among male students.
# Labels and heights are taken from the same value_counts() result instead of
# relying on reverse-sorted labels happening to line up with the frequency
# order of value_counts().
male_lunch_counts = df_male["lunch"].value_counts()
plt.bar(male_lunch_counts.index, male_lunch_counts.values);
print(male_lunch_counts)

65.5% of male students have a standard lunch.

In [None]:
# Mean math score for every (parental education level, race/ethnicity) pair.
pivot = df.pivot_table(
    index=["parental level of education"],
    columns=["race/ethnicity"],
    aggfunc={'math score': np.mean},
)
hm = sns.heatmap(pivot, annot=True, cmap="flare")

# Widen the y-limits by half a cell at each end
# (presumably a workaround for clipped top/bottom rows - TODO confirm it is still needed).
y_low, y_high = hm.get_ylim()
hm.set_ylim(y_low + 0.5, y_high - 0.5)
plt.show()

Group E has the highest math score

In [None]:
# Mean writing score for every (parental education level, race/ethnicity) pair.
pivot = df.pivot_table(
    index=["parental level of education"],
    columns=["race/ethnicity"],
    aggfunc={'writing score': np.mean},
)
hm = sns.heatmap(pivot, annot=True, cmap="flare")

# Widen the y-limits by half a cell at each end
# (presumably a workaround for clipped top/bottom rows - TODO confirm it is still needed).
y_low, y_high = hm.get_ylim()
hm.set_ylim(y_low + 0.5, y_high - 0.5)
plt.show()

Group E has the highest writing score

In [None]:
# Mean reading score for every (parental education level, race/ethnicity) pair.
pivot = df.pivot_table(
    index=["parental level of education"],
    columns=["race/ethnicity"],
    aggfunc={'reading score': np.mean},
)
hm = sns.heatmap(pivot, annot=True, cmap="flare")

# Widen the y-limits by half a cell at each end
# (presumably a workaround for clipped top/bottom rows - TODO confirm it is still needed).
y_low, y_high = hm.get_ylim()
hm.set_ylim(y_low + 0.5, y_high - 0.5)
plt.show()

Group E has the highest math, reading and writing scores.

#### Which degrees are in these groups, and how many of each?

In [None]:
# Display label for each group, keyed by its position in the [A, B, C, D, E] list used below.
index = {pos: f"Group {letter}" for pos, letter in enumerate("ABCDE")}

# One sub-frame per race/ethnicity group.
A, B, C, D, E = (df[df["race/ethnicity"] == f"group {letter}"] for letter in "ABCDE")

In [None]:
# One bar chart per race/ethnicity group: how many students fall under each parental education level.
for pos, group_df in enumerate((A, B, C, D, E)):
    edu_counts = group_df["parental level of education"].value_counts()
    edu_counts.plot(kind="bar")
    plt.title(f"degrees and count in {index[pos]}")
    plt.xticks(rotation=45)
    plt.show();

In [None]:
# Same breakdown as text: per group, the count of each parental education level and the group total.
for pos, group_df in enumerate((A, B, C, D, E)):
    edu_counts = group_df["parental level of education"].value_counts()
    print(index[pos]," : \n")
    print(edu_counts)
    print("total number of students :" ,edu_counts.sum())
    print("\n--------------------------------------------------")
    

We see that the most common parental education levels in Group E are an associate's degree and some college, which may have affected the scores; however, Group E still has fewer students with these degrees than Groups B, C and D.

Could this difference be affected by gender?

In [None]:
# Number of students per race/ethnicity group, split by gender.
# Variables are passed by keyword against `data=df`: recent seaborn releases
# treat the first positional argument as `data`, so a bare Series passed
# positionally is no longer interpreted as the x variable.
plt.figure(figsize=(12,10))
sns.countplot(data=df, x="race/ethnicity", hue="gender");

It seems that group E has a male count nearly equal to its female count, as does group D, so the scores might be affected by other external factors such as the scoring method or how students are assigned to each group.

## Model

Predicting the gender column

## generating new features

In [None]:
# Ratio of reading score to writing score.
df["red/wri"] = df["reading score"].div(df["writing score"])

# Gender and race/ethnicity concatenated into one categorical label.
# NOTE(review): this feature contains the gender value itself. Gender is the
# prediction target later in the notebook (y = df["gender_male"]), so this
# column must not reach the model as an input feature.
df["gend/gro"] = df["gender"].str.cat(df["race/ethnicity"])

In [None]:
def linearlabel(df,columns=None):
    """One-hot encode the given columns of ``df`` for use with a linear model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.
    columns : list, tuple, other iterable of column names, or str, optional
        Column name(s) to encode. A single column may be given as a plain
        string. When ``None`` (the default) nothing is encoded.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when ``columns`` is ``None``; otherwise a new frame in
        which every listed column is replaced by its dummy columns. The first
        level of each column is dropped (``drop_first=True``) so the
        indicators are not perfectly collinear.

    Raises
    ------
    TypeError
        If ``columns`` is neither ``None``, a string, nor an iterable.
    """
    if columns is None:
        return df

    # A bare string would otherwise be split into characters by list().
    if isinstance(columns, str):
        columns = [columns]

    # Accept any list-like (tuple, Index, ...) rather than silently skipping
    # the encoding when the argument is not exactly a ``list``.
    return pd.get_dummies(data=df, columns=list(columns), drop_first=True)

In [None]:
# One hot encoding for a linear model.
# NOTE: df is overwritten and the listed columns are replaced by their dummy
# columns, so this cell cannot be re-run without rebuilding df first.
categorical_cols = [
    "gender",
    "race/ethnicity",
    "parental level of education",
    "lunch",
    "test preparation course",
    "gend/gro",
]
df = linearlabel(df, columns=categorical_cols)

In [None]:
# Preview the first rows of the encoded frame.
df.head()

### Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

# Target: the "gender_male" dummy column (set for male students, unset for female).
y = df["gender_male"]

# The "gend/gro" feature was built by concatenating gender with race/ethnicity,
# so each of its dummy columns encodes the target. Keeping them in x would leak
# the answer to the model, so they are dropped together with the target itself.
leaky_cols = [col for col in df.columns if col.startswith("gend/gro")]
x = df.drop(columns=["gender_male", *leaky_cols])

# Fixed seed so the split, and every score computed from it, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

### Using SVC model

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel, fitted on the training split.
clf = SVC(kernel="linear")
clf.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import f1_score,confusion_matrix

# Predict once per split and reuse the result for both the score and the matrix.
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

print("f1 score training : ",f1_score(y_train,train_pred))
print("f1 score testing : ",f1_score(y_test,test_pred))

# fmt="d" prints the cell counts as plain integers; the default ".2g" format
# renders counts of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(confusion_matrix(y_test,test_pred),annot=True,fmt="d");

### Using Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# The features are not scaled, so the default limit of 100 iterations may stop
# the solver before it converges. The ConvergenceWarning that would signal this
# is hidden by the notebook-wide warnings filter, so the limit is raised
# explicitly.
clf_2 = LogisticRegression(max_iter=1000)

clf_2.fit(X_train,y_train)

In [None]:
# Predict once per split and reuse the result for both the score and the matrix.
train_pred_2 = clf_2.predict(X_train)
test_pred_2 = clf_2.predict(X_test)

print("f1 score training : ",f1_score(y_train,train_pred_2))
print("f1 score testing : ",f1_score(y_test,test_pred_2))

# fmt="d" prints the cell counts as plain integers; the default ".2g" format
# renders counts of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(confusion_matrix(y_test,test_pred_2),annot=True,fmt="d");