In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Predict Using Random Forest Classifier


We use scikit-learn's Random Forest Classifier to predict whether a particular student has completed the **test preparation course**.

* Given how well a student scored on the exams, we predict whether they completed the preparation course beforehand.

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as snb

In [None]:
# Load the student exam results into a DataFrame.
csv_path = "/kaggle/input/students-performance-in-exams/StudentsPerformance.csv"
df = pd.read_csv(csv_path)

* Read top few rows from the file using the head() method of Pandas.

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

* Check missing values
* <code>isnull()</code> and <code>sum()</code> are used to check whether the loaded data contains any missing values.

In [None]:
# Number of missing values in each column.
df.isna().sum()

* explore the target

In [None]:
# How many students fall into each class of the target column.
target_counts = df["test preparation course"].value_counts()
target_counts


* Turn the categorical values of the target into numbers

In [None]:
# Encode the target as integers: 0 = "none", 1 = "completed".
mapping = {"none" : 0, "completed" : 1}
prep_course = df['test preparation course']
# Only map while the column still holds the original string labels. Re-running
# this cell on an already-encoded column would look up 0/1 in the string-keyed
# dict and silently replace the whole target with NaN.
if not pd.api.types.is_numeric_dtype(prep_course):
    df['test preparation course'] = prep_course.map(mapping)
df.head()

* plot the pairplot graph of the original dataset using the target 'test preparation course' as the hue of the graph


In [None]:
import seaborn as sns

# Pairwise plots of the numeric columns, coloured by the encoded target.
sns.pairplot(data=df, hue='test preparation course', palette='Set1')

# Categorical data

In [None]:
# One-hot encode the categorical features; drop_first=True keeps k-1 dummy
# columns for a feature with k levels.
categorical_cols = ['gender', 'race/ethnicity', 'parental level of education', 'lunch']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# Preview the first five rows after one-hot encoding.
df.head(n=5)

# create X and y
    

In [None]:
# Target is the encoded course flag; features are every other column.
y = df["test preparation course"]
X = df.drop(columns=["test preparation course"])

# Split train and test 

In [None]:
# Hold out a test set (scikit-learn's default test_size is 25%).
# random_state fixes the shuffle so the scores below are reproducible after a
# kernel restart; stratify=y keeps the class ratio the same in both partitions.
from sklearn.model_selection import train_test_split
X_Train, X_Test, y_Train, y_Test = train_test_split(
    X, y, random_state=42, stratify=y
)

## Random Forest Classifier

<p>A random forest is <i>a meta estimator</i> that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.</p>

<code>n_estimators</code> <i>integer, optional (default=100 since scikit-learn 0.22; previously 10)</i>  The number of trees in the forest.<br/>



In [None]:
from sklearn.ensemble import RandomForestClassifier
# 100 trees, each split considering at most 10 features.
# random_state fixes the bootstrap sampling and feature sub-sampling so the
# fitted forest (and every score derived from it) is reproducible.
RandomForest = RandomForestClassifier(n_estimators=100, max_features=10, random_state=42)

**Fit X_Train and y_Train**

In [None]:
# Train the forest on the training partition.
RandomForest.fit(X=X_Train, y=y_Train)

* score of the training data 

In [None]:
# Mean accuracy on the data the forest was trained on.
train_accuracy = RandomForest.score(X_Train, y_Train)
train_accuracy

* score of the testing data 


In [None]:
# Mean accuracy on the held-out partition.
test_accuracy = RandomForest.score(X_Test, y_Test)
test_accuracy

Looks like the model is not able to generalise very well on unseen data. Let's investigate more evaluation metrics

In [None]:
# Predicted class for each held-out row.
predictions = RandomForest.predict(X=X_Test)

* classification report of the true labels <code>y_Test</code> compared to the predictions

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the held-out predictions.
report = classification_report(y_true=y_Test, y_pred=predictions)
print(report)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_Test, predictions)
fig, ax = plt.subplots(figsize=(5, 5))
# fmt='d' prints the integer counts in full; seaborn's default '.2g' format
# renders counts of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix');

In [None]:
# Horizontal bar chart of the fitted forest's feature importances.
n_features = X.shape[1]
plt.barh(range(n_features), RandomForest.feature_importances_)
# Take the tick labels from X (the frame the model was trained on) rather than
# df.columns[1:]: the latter only lines up while the target is df's first
# column and stops matching n_features once more columns are added to df.
plt.yticks(np.arange(n_features), X.columns)
plt.xlabel('Feature importance')
plt.ylabel('Feature')
plt.title('Random forest feature importances');

# Another approach - Logistic Regression and polynomial features

In [None]:
# Preview the first five rows before adding engineered features.
df.head(n=5)

In [None]:
# Show the column labels currently in df.
df.columns

In [None]:
# Average of the three exam scores for each student.
score_cols = ["math score", "reading score", "writing score"]
# skipna=False so a missing score yields NaN, exactly as chained `+` would.
df["mean_grade"] = df[score_cols].sum(axis=1, skipna=False) / 3

In [None]:
# Add a squared term for each of the three exam scores.
for score_col in ("math score", "reading score", "writing score"):
    df[f"{score_col}_squared"] = df[score_col] ** 2

In [None]:
# Show the column labels currently in df.
df.columns

In [None]:
# Logistic regression on every column of df except the target.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Use the `columns=` keyword: the positional form df.drop(label, 1) was removed
# in pandas 2.0 and raises TypeError there.
X = df.drop(columns="test preparation course")
y = df["test preparation course"]

# Split BEFORE scaling so the held-out rows do not contribute to the scaler's
# mean/variance (otherwise the test score is computed on leaked statistics).
# random_state makes the split, and therefore the printed scores, reproducible.
X_Train, X_Test, y_Train, y_Test = train_test_split(
    X, y, random_state=42, stratify=y
)

scaler = StandardScaler()
X_Train = scaler.fit_transform(X_Train)  # learn scaling from training rows only
X_Test = scaler.transform(X_Test)        # apply the same scaling to the test rows

model = LogisticRegression()
model.fit(X_Train, y_Train)
print(model.score(X_Train, y_Train))  # training accuracy
print(model.score(X_Test, y_Test))    # held-out accuracy