## What are the variables that have the greatest influence on math grades?
## Is it possible to predict student grades based on these variables?

In [1]:
import pandas as pd
df = pd.read_csv("Expanded_data_with_more_features.csv")

In [2]:
df.dtypes

Unnamed: 0               int64
Gender                  object
EthnicGroup             object
ParentEduc              object
LunchType               object
TestPrep                object
ParentMaritalStatus     object
PracticeSport           object
IsFirstChild            object
NrSiblings             float64
TransportMeans          object
WklyStudyHours          object
MathScore                int64
ReadingScore             int64
WritingScore             int64
dtype: object

# Data Cleaning

In [3]:
df.isna().sum()

Unnamed: 0                0
Gender                    0
EthnicGroup            1840
ParentEduc             1845
LunchType                 0
TestPrep               1830
ParentMaritalStatus    1190
PracticeSport           631
IsFirstChild            904
NrSiblings             1572
TransportMeans         3134
WklyStudyHours          955
MathScore                 0
ReadingScore              0
WritingScore              0
dtype: int64

## Drop Columns

In [4]:
df = df.drop("Unnamed: 0", axis=1)

## Drop NaNs

In [5]:
cols_to_drop = ["EthnicGroup","ParentEduc","TestPrep","ParentMaritalStatus","PracticeSport", "IsFirstChild","NrSiblings","TransportMeans","WklyStudyHours"]
df = df.dropna(subset=cols_to_drop, axis=0)

In [6]:
df.isna().sum()

Gender                 0
EthnicGroup            0
ParentEduc             0
LunchType              0
TestPrep               0
ParentMaritalStatus    0
PracticeSport          0
IsFirstChild           0
NrSiblings             0
TransportMeans         0
WklyStudyHours         0
MathScore              0
ReadingScore           0
WritingScore           0
dtype: int64

In [7]:
df.head()

Unnamed: 0,Gender,EthnicGroup,ParentEduc,LunchType,TestPrep,ParentMaritalStatus,PracticeSport,IsFirstChild,NrSiblings,TransportMeans,WklyStudyHours,MathScore,ReadingScore,WritingScore
2,female,group B,master's degree,standard,none,single,sometimes,yes,4.0,school_bus,< 5,87,93,91
4,male,group C,some college,standard,none,married,sometimes,yes,0.0,school_bus,5 - 10,76,78,75
5,female,group B,associate's degree,standard,none,married,regularly,yes,1.0,school_bus,5 - 10,73,84,79
6,female,group B,some college,standard,completed,widowed,never,no,1.0,private,5 - 10,85,93,89
7,male,group B,some college,free/reduced,none,married,sometimes,yes,1.0,private,> 10,41,43,39


# Type of data

Data - Categorical - Ordinal = "ParentEduc", "TestPrep", "PracticeSport"
Data - Categorical - Nominal = "Gender", "EthnicGroup", "LunchType", "ParentMaritalStat", "PracticeSport", "IsFirstChild", "TransportMeans"] 
Data - Numerical - Discrete = "WklyStudyHours", "NrSiblings"
Data - Numerical - Continuous = "MathScore","ReadingScore","WritingScore "

## Transformation of Categorical data 

### Ordinal data - Label incoding
<p>"ParentEduc"</p> 
<p>"TestPrep"</p>
<p>"PracticeSport"</p>

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

#### ParentEduc

In [9]:
df["ParentEduc"].unique()

array(["master's degree", 'some college', "associate's degree",
       'high school', 'some high school', "bachelor's degree"],
      dtype=object)

In [10]:
le.fit(["some high school","high school" ,"some college", "associate's degree", "bachelor's degree", "master's degree"])
df["ParentEduc"] = le.transform(df["ParentEduc"])

#### TestPrep

In [11]:
df["TestPrep"].unique()

array(['none', 'completed'], dtype=object)

In [12]:
le.fit(['none', 'completed'])
df["TestPrep"] = le.transform(df["TestPrep"])

#### PracticeSport

In [13]:
df["PracticeSport"].unique()

array(['sometimes', 'regularly', 'never'], dtype=object)

In [14]:
le.fit(["never", "sometimes", "regularly"])
df["PracticeSport"] = le.transform(df["PracticeSport"])

#### WklyStudyHours

In [15]:
df["WklyStudyHours"].unique()

array(['< 5', '5 - 10', '> 10'], dtype=object)

In [16]:
le.fit(['< 5', '5 - 10', '> 10'])
df["WklyStudyHours"] = le.transform(df["WklyStudyHours"])

### Nominal - Getdummies

<p>"Gender"</p>
<p>"EthnicGroup" </p>
<p>"LunchType"</p>
<p>"ParentMaritalStat"</p> 
<p>"PracticeSport"</p> 
<p>"IsFirstChild"</p>
<p>"TransportMeans"</p>

In [17]:
df = pd.get_dummies(df, columns=["Gender", "EthnicGroup", "LunchType", "ParentMaritalStatus", "PracticeSport", "IsFirstChild", "TransportMeans"])

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19243 entries, 2 to 30640
Data columns (total 27 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   ParentEduc                    19243 non-null  int32  
 1   TestPrep                      19243 non-null  int32  
 2   NrSiblings                    19243 non-null  float64
 3   WklyStudyHours                19243 non-null  int32  
 4   MathScore                     19243 non-null  int64  
 5   ReadingScore                  19243 non-null  int64  
 6   WritingScore                  19243 non-null  int64  
 7   Gender_female                 19243 non-null  uint8  
 8   Gender_male                   19243 non-null  uint8  
 9   EthnicGroup_group A           19243 non-null  uint8  
 10  EthnicGroup_group B           19243 non-null  uint8  
 11  EthnicGroup_group C           19243 non-null  uint8  
 12  EthnicGroup_group D           19243 non-null  uint8  
 13  E

In [19]:
df.columns

Index(['ParentEduc', 'TestPrep', 'NrSiblings', 'WklyStudyHours', 'MathScore',
       'ReadingScore', 'WritingScore', 'Gender_female', 'Gender_male',
       'EthnicGroup_group A', 'EthnicGroup_group B', 'EthnicGroup_group C',
       'EthnicGroup_group D', 'EthnicGroup_group E', 'LunchType_free/reduced',
       'LunchType_standard', 'ParentMaritalStatus_divorced',
       'ParentMaritalStatus_married', 'ParentMaritalStatus_single',
       'ParentMaritalStatus_widowed', 'PracticeSport_0', 'PracticeSport_1',
       'PracticeSport_2', 'IsFirstChild_no', 'IsFirstChild_yes',
       'TransportMeans_private', 'TransportMeans_school_bus'],
      dtype='object')

# Decision Tree

In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor


In [21]:
# Separating the data into independent and dependent variables
# Converting each dataframe into a numpy array
# since each dataframe contains only one column

RAND_STATE = 34 # for reproducible shuffling
TT_RATIO = 0.3 # test/train

dfX = df.drop('MathScore',axis=1)

X=dfX
y=df['MathScore'] 

# Splitting the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TT_RATIO, random_state=RAND_STATE)


## Model Development

In [28]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

DecisionTreeRegressor()

In [29]:
y_pred = model.predict(X_test)

In [30]:
y_pred

array([62., 65., 67., ..., 56., 81., 74.])

In [32]:
dfT = pd.DataFrame({'Real Values':y_test, 'Predicted Values':y_pred})
dfT

Unnamed: 0,Real Values,Predicted Values
10818,63,62.0
2467,60,65.0
24496,69,67.0
27822,73,70.0
29936,73,70.0
...,...,...
11699,73,64.0
583,60,59.0
9125,49,56.0
18596,69,81.0


In [34]:
# Visualising the Decision Tree Regression Results 
X_grid = np.arange(min(X), max(X), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X_test, y_test, color = 'red')
plt.scatter(X_test, y_pred, color = 'green')
plt.title('Decision Tree Regression')
plt.xlabel('Real Values')
plt.ylabel('Predicted Values')
plt.show()

plt.plot(X_grid, regressor.predict(X_grid), color = 'black')
plt.title('Decision Tree Regression')
plt.xlabel('Real Values')
plt.ylabel('Predicted Values')
plt.show()

TypeError: unsupported operand type(s) for -: 'str' and 'str'

## Model Evaluation

In [None]:
y_pred = model.predict(X_test)
print(mean_squared_error(y_test,y_pred))

### OLS

In [None]:
import statsmodels.api as sm
X_train_const_ct = sm.add_constant(X_train.to_numpy()) 

model = sm.OLS(y_train, X_train_const_ct).fit()
predictions_train = model.predict(X_train_const_ct)

X_test_const_ct = sm.add_constant(X_test) 
predictions_test = model.predict(X_test_const_ct)
print_model = model.summary()
print(print_model)