# Students Performance in Exams: Linear Regression

In [None]:
#import all dependencies
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
# Switch matplotlib over to seaborn's default theme for every plot drawn below.
sns.set()

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
kaggle_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_files:
    print(path)

## 1. Collecting the data

In [None]:
# Load the students-performance CSV into the notebook-wide `perform` frame
# and preview its first rows.
perform = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')
perform.head()

In [None]:
# Column names, dtypes and non-null counts.
perform.info()

In [None]:
# Summary statistics of the numeric columns.
perform.describe()

In [None]:
# Number of missing values in each column.
perform.isna().sum()

## 2. Defining the problem statement

Complete the analysis and build a model that predicts the gender of a student.

## 3. Exploratory data analysis

In [None]:
def box_chart(feature):
    """Draw a stacked bar chart of `feature` value counts, one bar per gender.

    Reads the notebook-wide `perform` frame. Rows with `gender == 0` are
    labelled 'male' and rows with `gender == 1` are labelled 'female', so
    the gender column must already be numerically encoded when this runs.
    """
    is_male = perform['gender'] == 0
    is_female = perform['gender'] == 1
    counts_by_gender = pd.DataFrame([
        perform.loc[is_male, feature].value_counts(),
        perform.loc[is_female, feature].value_counts(),
    ])
    counts_by_gender.index = ['male', 'female']
    counts_by_gender.plot(kind='bar', stacked=True, figsize=(10, 5))

In [None]:
def cross_tab_(feature):
    """Print, for each value of `feature`, the percentage split across genders.

    Builds a contingency table of `perform[feature]` against
    `perform['gender']` and rescales every row so that it sums to 100.
    """
    def as_row_percent(row):
        return row * 100 / row.sum()

    counts = pd.crosstab(perform[feature], perform['gender'])
    print(counts.apply(as_row_percent, axis=1))

In [None]:
# Encode gender numerically: 'female' becomes 1, every other value becomes 0.
perform['gender'] = perform['gender'].eq('female').astype('int64')

In [None]:
# Preview the frame now that `gender` holds 0/1 codes.
perform.head()

In [None]:
# Stacked bar chart and row-percentage table of gender against
# test-preparation status.
box_chart('test preparation course')
cross_tab_('test preparation course')

In [None]:
# Stacked bar chart of parental education by gender, followed by the same
# relationship as a table of row percentages (each education level sums to 100).
box_chart('parental level of education')
education_counts = pd.crosstab(perform['parental level of education'], perform['gender'])
cross_tab = education_counts.apply(lambda row: row * 100 / row.sum(), axis=1)
print(cross_tab)

In [None]:
# Stacked bar chart and row-percentage table of gender against race/ethnicity group.
box_chart('race/ethnicity')
cross_tab_('race/ethnicity')

In [None]:
# Stacked bar chart and row-percentage table of gender against lunch type.
box_chart('lunch')
cross_tab_('lunch')

In [None]:
# How many students obtained each distinct math score.
perform['math score'].value_counts()

In [None]:
# Number of students at each distinct math score, with separate bars per
# value of the `gender` column. The column is passed as `x=` because
# seaborn 0.12+ no longer accepts the plotted variable positionally.
plt.figure(figsize=(20, 10))
sns.countplot(x='math score', hue='gender', data=perform)

In [None]:
# Math-score count plot by gender, zoomed to x-axis range 30-45.
# The column is passed as `x=` because seaborn 0.12+ no longer accepts the
# plotted variable positionally.
# NOTE(review): countplot treats x as categorical, so these limits appear to
# select bar positions rather than score values — confirm intent.
plt.figure(figsize=(20, 10))
sns.countplot(x='math score', hue='gender', data=perform)
plt.xlim(30, 45)

In [None]:
# Math-score count plot by gender, zoomed to x-axis range 45-55.
# The column is passed as `x=` because seaborn 0.12+ no longer accepts the
# plotted variable positionally.
# NOTE(review): countplot treats x as categorical, so these limits appear to
# select bar positions rather than score values — confirm intent.
plt.figure(figsize=(20, 10))
sns.countplot(x='math score', hue='gender', data=perform)
plt.xlim(45, 55)

In [None]:
# Math-score count plot by gender, zoomed to x-axis range 55-65.
# The column is passed as `x=` because seaborn 0.12+ no longer accepts the
# plotted variable positionally.
# NOTE(review): countplot treats x as categorical, so these limits appear to
# select bar positions rather than score values — confirm intent.
plt.figure(figsize=(20, 10))
sns.countplot(x='math score', hue='gender', data=perform)
plt.xlim(55, 65)

In [None]:
# Math-score count plot by gender, zoomed to x-axis range 65-75.
# The column is passed as `x=` because seaborn 0.12+ no longer accepts the
# plotted variable positionally.
# NOTE(review): countplot treats x as categorical, so these limits appear to
# select bar positions rather than score values — confirm intent.
plt.figure(figsize=(20, 10))
sns.countplot(x='math score', hue='gender', data=perform)
plt.xlim(65, 75)

In [None]:
# Math-score count plot by gender, zoomed to x-axis range 75-85.
# The column is passed as `x=` because seaborn 0.12+ no longer accepts the
# plotted variable positionally.
# NOTE(review): countplot treats x as categorical, so these limits appear to
# select bar positions rather than score values — confirm intent.
plt.figure(figsize=(20, 10))
sns.countplot(x='math score', hue='gender', data=perform)
plt.xlim(75, 85)

In [None]:
# Number of students at each distinct reading score, with separate bars per
# value of the `gender` column. The column is passed as `x=` because
# seaborn 0.12+ no longer accepts the plotted variable positionally.
plt.figure(figsize=(20, 10))
sns.countplot(x='reading score', hue='gender', data=perform)

In [None]:
# Number of students at each distinct writing score, with separate bars per
# value of the `gender` column. The column is passed as `x=` because
# seaborn 0.12+ no longer accepts the plotted variable positionally.
plt.figure(figsize=(20, 10))
sns.countplot(x='writing score', hue='gender', data=perform)

In [None]:
# Annotated correlation heatmap of the numeric columns.
# Non-numeric columns are excluded explicitly: older pandas dropped them
# silently inside DataFrame.corr(), whereas pandas 2.0+ raises on them.
plt.figure(figsize=(8, 6))
sns.heatmap(perform.select_dtypes(include='number').corr(), annot=True)

## 4. Feature Engineering

### 4.1 parental level of education

In [None]:
# Preview the frame before the remaining feature columns are encoded.
perform.head()

In [None]:
# Replace each parental-education label with a fixed numeric weight.
# NOTE(review): the weights are hard-coded and their origin is not shown
# here — confirm how they were derived.
parent_ed_map = {
    'some college': 0.43,
    "associate's degree": 0.47,
    'high school': 0.7,
    'some high school': 0.5,
    "bachelor's degree": 0.46,
    "master's degree": 0.3,
}
perform['parental level of education'] = (
    perform['parental level of education'].map(parent_ed_map)
)

In [None]:
# Preview the frame after mapping parental education to numeric weights.
perform.head()

### 4.2 lunch

In [None]:
# Encode lunch numerically: 'free/reduced' becomes 0, every other value becomes 1.
perform['lunch'] = perform['lunch'].ne('free/reduced').astype('int64')

In [None]:
# Preview the frame after encoding `lunch` as 0/1.
perform.head()

### 4.3 test preparation course

In [None]:
# Encode test preparation numerically: 'completed' becomes 1, every other value becomes 0.
perform['test preparation course'] = (
    perform['test preparation course'].eq('completed').astype('int64')
)

In [None]:
# Preview the frame after encoding `test preparation course` as 0/1.
perform.head()

### 4.4 race/ethnicity

In [None]:
# Replace each race/ethnicity group label with a fixed numeric weight.
# NOTE(review): the weights are hard-coded and their origin is not shown
# here — confirm how they were derived.
group_map = {
    'group A': 0.2,
    'group B': 0.55,
    'group C': 0.6,
    'group D': 0.33,
    'group E': 0.3,
}
perform['race/ethnicity'] = perform['race/ethnicity'].map(group_map)

In [None]:
# Preview the frame after mapping race/ethnicity groups to numeric weights.
perform.head()

### 4.5 scores

In [None]:
# Divide the three score columns by 100 in one vectorised step.
score_columns = ['math score', 'reading score', 'writing score']
perform[score_columns] = perform[score_columns] / 100

In [None]:
# Preview the frame after dividing the score columns by 100.
perform.head()

## 5. Modeling 

### 5.1 Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target is the 0/1 `gender` column; features are every remaining column
# except `reading score`.
# NOTE(review): the reason for leaving out `reading score` is not stated — confirm.
y = perform['gender']
X = perform.drop(columns=['gender', 'reading score'])

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Preview the target vector.
y.head()

In [None]:
# Hold out 40% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)

### 5.2 Creating and Training the Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares linear regression with default settings.
# NOTE(review): the target used in this notebook is a 0/1 gender code, so a
# classifier (e.g. logistic regression) may be a better fit — confirm.
model = LinearRegression()

In [None]:
# Fit the regression on the training portion only.
model.fit(X_train,y_train)

In [None]:
# Show the fitted per-feature coefficients followed by the intercept.
print(model.coef_, model.intercept_)

### 5.3 Model Evaluation

In [None]:
# Tabulate the fitted coefficients, one row per feature column of X.
coeff_df = pd.DataFrame({'Coefficent': model.coef_}, index=X.columns)
coeff_df

### 5.4 Predictions from our Model

In [None]:
# Continuous model outputs for the held-out rows, plotted against the true
# 0/1 labels.
predictions = model.predict(X=X_test)
plt.scatter(x=y_test, y=predictions)

In [None]:
# Turn the continuous outputs into 0/1 labels: 1 where the prediction exceeds 0.6.
copy_pred = (predictions > 0.6).astype(int)

In [None]:
# Distribution of the residuals (true label minus continuous prediction).
# histplot replaces sns.distplot, which is deprecated and scheduled for
# removal; stat='density' with kde=True reproduces distplot's density
# histogram and overlaid KDE curve.
sns.histplot(y_test - predictions, bins=50, kde=True, stat='density')

### 5.5 Model Evaluation Metrics

In [None]:
# True positives: rows predicted as class 1 where the prediction matches the label.
TP = ((y_test == copy_pred) & (copy_pred == 1)).sum()

In [None]:
# False positives: rows predicted as class 1 where the prediction differs from the label.
FP = ((y_test != copy_pred) & (copy_pred == 1)).sum()

In [None]:
# Precision = TP / (TP + FP): the share of class-1 predictions that are correct.
predicted_positive = TP + FP
# With no class-1 predictions the ratio is undefined, so report NaN instead
# of dividing by zero.
precision = TP / predicted_positive if predicted_positive else float('nan')
print(precision)