# Libraries

In [None]:
# Basic Import
import numpy as np
import pandas as pd

# Vis.
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

import warnings


import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# Data Preparation

In [None]:
# Read the students-performance CSV into a DataFrame and preview its first rows.
csv_path = "/kaggle/input/students-performance-in-exams/StudentsPerformance.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
#data.columns

In [None]:
# Average of the three exam scores, stored as a new "mean_scores" column.
exam_columns = ["math score", "reading score", "writing score"]
data["mean_scores"] = sum(data[column] for column in exam_columns) / 3

In [None]:
# Print the column dtypes and non-null counts to check for missing values.
# Author's note: no missing values were found, so the next step is to look at
# the summary statistics with describe().
data.info()

In [None]:
data.describe().T
# Author's observations from this summary when the notebook was run
# (re-check them against the current output):
# - the data looks roughly normally distributed;
# - "math score" has a minimum of 0, which the author attributes to math
#   being the harder subject;
# - the author noted a minimum exam_score of 27 (unclear which column this
#   refers to -- verify);
# - the mean and the median (50%) are very close.

In [None]:
# Pairwise correlations of the score columns, inspected before the categorical
# columns are one-hot encoded with get_dummies.
# Only numeric columns are selected explicitly: the frame still holds the
# categorical (string) columns at this point, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 instead of silently skipping it.
data.select_dtypes(include="number").corr()

In [None]:
import matplotlib.pyplot as plt
# I did that to see histogram plot and distribution of exam_scores

def histogramPlot(variable):
    """Draw a density-normalised histogram of *variable* with its KDE on top.

    ``variable`` must expose the pandas ``.plot`` accessor (a Series or a
    DataFrame). The histogram uses 15 bins and ``density=True`` so that it is
    on the same y-scale as the kernel density estimate drawn over it.
    Nothing is returned; the function only draws.
    """
    hist_axes = variable.plot(kind="hist", density=True, bins=15)
    # Hand the histogram's axes to the second call so the KDE is overlaid on
    # it. Without ``ax=`` pandas opens a new figure for DataFrame input, which
    # splits the two layers into separate plots.
    variable.plot(kind="kde", ax=hist_axes)

# Plot the distributions for the whole frame. The guard is true only when this
# code runs as the main script/notebook session, so nothing is drawn on import.
if __name__=='__main__':
    histogramPlot(data)

In [None]:
# Display the frame, which now also carries the "mean_scores" column.
data

In [None]:
# List the column names, i.e. the candidates for the group-by analysis.
data.columns

In [None]:
def groupbyFunc(data, feature):
    """Summarise how one categorical feature relates to the numeric columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``feature`` and the numeric columns to average.
    feature : str
        Name of the column to group by.

    Returns
    -------
    tuple
        ``(values, feature_analysis)`` where ``values`` is the number of rows
        per category of ``feature`` and ``feature_analysis`` is the mean of
        every numeric column per category.
    """
    values = data[feature].value_counts()
    # numeric_only=True: a mean is undefined for the remaining string columns,
    # and pandas >= 2.0 raises TypeError rather than dropping them silently.
    feature_analysis = data.groupby(feature).mean(numeric_only=True)
    return values, feature_analysis
    
    
# First: category counts and per-category means by parental education level.
groupbyFunc(data, "parental level of education")


This shows how each category of a feature relates to the average scores, so
- **I can say: students whose parents hold a master's or bachelor's degree score better on average.**

In [None]:
# Second: category counts and per-category means by race/ethnicity group.
groupbyFunc(data, "race/ethnicity")

 **This is only a descriptive comparison of group averages, not a judgement about any group:**
- Group E clearly has the highest average score in every subject, and
- Group A has the lowest average score.

In [None]:
# Last: category counts and per-category means by gender.
groupbyFunc(data, "gender")

The subject-by-subject analysis is:

- **Males score better than females in math.**
- **Females score better than males in reading and writing, so**
- **females are more successful than males overall.**

In [None]:
# One box plot per score column, to look for outliers. The author wants to
# drop outlier scores in order to get a better RMSE.
for score_column in ("math score", "reading score", "writing score", "mean_scores"):
    sns.boxplot(y=data[score_column])
    plt.show()


In [None]:
def drop_outliers(df, column_name, lower, upper):
    """Return the rows of *df* whose ``column_name`` lies between two quantiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_name : str
        Column whose values decide which rows are kept.
    lower, upper : float
        Quantile levels in ``[0, 1]``. Rows with a value between the ``lower``
        and ``upper`` quantile (both inclusive) are kept.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the kept rows, with their original index.

    Prints how many data points remain out of the original count.
    """
    column = df[column_name]
    in_range = column.between(column.quantile(lower), column.quantile(upper))

    print(f"{column[in_range].size}/{column.size} data points remain.")

    # Filter with the boolean mask rather than dropping by index label: on a
    # non-unique index, a label-based drop would also remove in-range rows
    # that share a label with an out-of-range row.
    return df[in_range].copy()


# Keep only the rows whose mean score lies between the 5% and 95% quantiles.
new_data = drop_outliers(
    data,
    column_name="mean_scores",
    lower=0.05,
    upper=0.95,
)

**I dropped the outlier values of the mean scores before preparing the data for modelling and applying the get_dummies function.**

In [None]:
# Compare the frame sizes before and after the quantile filter.
for label, frame in (("data:", data), ("new_data:", new_data)):
    print(label, frame.shape)

In [None]:
# Pull each score column out as a candidate target; every other column
# becomes a model feature.
target_columns = ["math score", "reading score", "writing score", "mean_scores"]
math_score, reading_score, writing_score, mean_score = (
    new_data[name] for name in target_columns
)
X_features = new_data.drop(columns=target_columns)

**My target value is mean_score, so I separated it from the feature columns.**

# Get Dummy Function

In [None]:
# Display the feature columns left after the score columns were removed.
X_features

In [None]:
# Cast every feature column to a categorical dtype, then one-hot encode them.
# drop_first=True leaves out the first level of each feature.
categorical_features = X_features.astype('category')
X_features_encoded = pd.get_dummies(categorical_features, drop_first=True)
X_features_encoded


- **get_dummies is the best function I have worked with for this: when you need to encode the categorical columns of a dataset, you can generally use either this function or LabelEncoder().**

In [None]:
# Display the target series (the mean score of the rows kept in new_data).
mean_score

# Train Test 

In [None]:
# Shuffle the rows and hold out 40% of them for validation; the fixed seed
# keeps the split reproducible.
target = mean_score
split_options = {"test_size": 0.4, "shuffle": True, "random_state": 1}
X_train, X_val, y_train, y_val = train_test_split(
    X_features_encoded, target, **split_options
)

# Modelling

In [None]:
def calculateModel(real, predict):
    """Print the RMSE and the R^2 score of predictions against true values.

    ``real`` holds the true target values and ``predict`` the model's
    predictions for the same samples. Nothing is returned.
    """
    score = r2_score(real, predict)
    squared_error = mean_squared_error(real, predict)
    root_error = np.sqrt(squared_error)
    print("rmse:", root_error)
    print("r2 score:", score)

# RMSE - R^2 Score

In [None]:
## Fit a random forest and a linear regression on the training split and
## print RMSE / R^2 for both the training and the validation split.
print("Random Forest Regressor")
print("------------")
rf = RandomForestRegressor(random_state=0).fit(X_train, y_train)
rf_pred = rf.predict(X_train)
print("Train set of RF")
calculateModel(y_train, rf_pred)

print("------------")
print("Test set of RF")
rf_pred_val = rf.predict(X_val)
calculateModel(y_val, rf_pred_val)

print("------------")


print("Linear Regression")
print("------------")
# `normalize=True` is no longer passed: the parameter was deprecated in
# scikit-learn 1.0 and removed in 1.2, where it raises TypeError. For plain
# least squares, rescaling the features should leave the predictions unchanged
# up to numerical precision.
lr = LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_train)
print("Train set of LR")
calculateModel(y_train, lr_pred)

print("------------")
print("Test set of LR")
lr_pred_val = lr.predict(X_val)
calculateModel(y_val, lr_pred_val)
