# Predict Students Performance in Exams with MultiOutputRegressor

This notebook explores how parents' background, test preparation, and other factors influence students' exam performance.

```
Step 1. Data Load & EDA
Step 2. Visualization
     2-a. Each X's distribution
     2-b. Each y's distribution
     2-c. X & y's distribution
Step 3. Data Preprocessing
Step 4. Modeling & Prediction
```

### Step 1. Data Load & EDA

In [None]:
import os
# Walk the Kaggle input directory tree and print the full path of every file
# found, to confirm which datasets are attached to the notebook.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
import pandas as pd

In [None]:
# NOTE(review): this bare `cd` is not Python and presumably relies on IPython
# automagic. The CSV is read below through an absolute /kaggle/input path, so
# this directory change looks unnecessary — confirm that /input exists in the
# target environment.
cd /input

In [None]:
# Load the Students Performance CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/students-performance-in-exams/StudentsPerformance.csv')

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Summarize the columns (dtypes and non-null counts).
df.info()

In [None]:
# Print the distinct values of every column except the last three
# (the last three columns are split off as the targets `y` later on).
for col in df.columns[:-3]:
    print(
        'column : {}'.format(col),
        'Unique values : {}'.format(df[col].unique()),
        '------------------------------',
        sep='\n',
    )

### Step 2. Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Apply matplotlib's dark theme to the figures drawn after this point.
plt.style.use('dark_background')

In [None]:
import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation notices emitted by the plotting libraries.
warnings.filterwarnings(action='ignore')

- **Each X's distribution**

In [None]:
def show_values(col):
    """Draw a bar chart of how often each distinct value occurs in ``df[col]``.

    Args:
        col: Name of the column in the module-level ``df`` to summarize.
    """
    counts = df[col].value_counts()

    plt.figure(figsize=(10, 4))
    plt.title(col)
    # Distinct values go on the x-axis, their frequencies on the y-axis.
    sns.barplot(x=counts.index, y=counts.values)
    plt.show()

In [None]:
# Plot the value counts of every feature column (all but the last three).
for col in df.columns[:-3]:
    show_values(col)

- **Each y's distribution**

In [None]:
def show_dist(col):
    """Plot the distribution of ``df[col]`` as a density histogram with a KDE curve.

    Args:
        col: Name of the column in the module-level ``df`` to plot.
    """
    plt.figure(figsize=(10, 4))
    plt.title(col)
    # ``sns.distplot`` is deprecated; a density-normalized ``histplot`` with a
    # KDE overlay is the replacement documented by seaborn.
    sns.histplot(df[col], kde=True, stat='density', kde_kws=dict(cut=3))
    plt.show()

In [None]:
# Plot the distribution of each of the last three columns (the targets).
for col in df.columns[-3:]:
    show_dist(col)

- **X & y's distribution**

In [None]:
df.columns

In [None]:
def show_score(col):
    """Draw one bar plot per score column, grouped by the values of ``col``.

    The last three columns of the module-level ``df`` are treated as the
    scores; each plot is rendered with its own ``plt.show()`` call.

    Args:
        col: Name of the column in ``df`` to put on the x-axis.
    """
    score_columns = df.columns[-3:]
    for score_name in score_columns:
        sns.barplot(data=df, x=col, y=score_name)
        plt.title('{} & {}'.format(col, score_name))
        # Rotate the tick labels so long category names do not overlap.
        plt.xticks(rotation=90)
        plt.show()

In [None]:
# Average each numeric score column per gender. ``numeric_only=True`` keeps the
# remaining text columns out of the aggregation; without it pandas >= 2.0
# raises a TypeError when it tries to average strings.
df.groupby('gender').mean(numeric_only=True)

In [None]:
# Bar plots of each score column broken down by gender.
show_score('gender')

In [None]:
# Average each numeric score column per race/ethnicity group.
# ``numeric_only=True`` keeps the remaining text columns out of the
# aggregation; without it pandas >= 2.0 raises a TypeError on string columns.
df.groupby('race/ethnicity').mean(numeric_only=True)

In [None]:
# Bar plots of each score column broken down by race/ethnicity group.
show_score('race/ethnicity')

In [None]:
# Average each numeric score column per parental education level.
# ``numeric_only=True`` keeps the remaining text columns out of the
# aggregation; without it pandas >= 2.0 raises a TypeError on string columns.
df.groupby('parental level of education').mean(numeric_only=True)

In [None]:
# Bar plots of each score column broken down by parental education level.
show_score('parental level of education')

In [None]:
# Average each numeric score column per lunch type. ``numeric_only=True`` keeps
# the remaining text columns out of the aggregation; without it pandas >= 2.0
# raises a TypeError on string columns.
df.groupby('lunch').mean(numeric_only=True)

In [None]:
# Bar plots of each score column broken down by lunch type.
show_score('lunch')

In [None]:
# Average each numeric score column per test-preparation status.
# ``numeric_only=True`` keeps the remaining text columns out of the
# aggregation; without it pandas >= 2.0 raises a TypeError on string columns.
df.groupby('test preparation course').mean(numeric_only=True)

In [None]:
# Bar plots of each score column broken down by test-preparation status.
show_score('test preparation course')

### Step 3. Data Preprocessing

In [None]:
# Re-check the column summary before splitting the frame into features and targets.
df.info()

In [None]:
# Split by position: every column except the last three is a feature (X);
# the last three columns are the targets (y).
X, y = df.iloc[:, :-3], df.iloc[:, -3:]

In [None]:
X

In [None]:
y

In [None]:
# One-hot encode the feature columns with pandas' get_dummies, then display the result.
X_encoded = pd.get_dummies(X)
X_encoded

In [None]:
# Place the encoded features and the target columns side by side in one frame
# (axis=1 concatenates column-wise), then display it.
new_df = pd.concat([X_encoded,y], axis=1)
new_df

In [None]:
# Split the combined frame back into features and targets by position:
# the last three columns are the targets, everything before them is a feature.
new_X, new_y = new_df.iloc[:, :-3], new_df.iloc[:, -3:]

In [None]:
# Sanity check: print the (rows, columns) shape of the feature and target frames.
print(new_X.shape, new_y.shape)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows as the test split; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    new_X,
    new_y,
    test_size=0.25,
    random_state=111,
)

In [None]:
# Sanity check: print the shapes of the train and test splits.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

### Step 4. Modeling & Prediction

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score

In [None]:
# Wrap a 100-tree random forest (fixed seed) in MultiOutputRegressor and fit it
# on the training split, so that all columns of y_train are predicted through
# a single estimator object.
rfr = RandomForestRegressor(n_estimators=100, random_state=0)
mr = MultiOutputRegressor(estimator=rfr)
mr.fit(X_train, y_train)

In [None]:
# Predict on the training split (not the test split) to measure fit on data the model has seen.
y_pred = mr.predict(X_train)

In [None]:
# R² on the training split; compare with the test-split score to gauge overfitting.
r2_score(y_train, y_pred)

In [None]:
# Score the fitted model on the held-out test split (scikit-learn regressors report R² here).
mr.score(X_test, y_test)