# 1. Importing the relevant libraries and data.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats

In [None]:
# Load the placement dataset and discard the two columns that are not used
# as model inputs (`sl_no` and `salary`).
placement_csv = "../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv"
df = pd.read_csv(placement_csv).drop(columns=["sl_no", "salary"])
df.head()

Dropping **sl_no** and **salary**.
- Salary is dropped because it is an outcome of being placed, so using it to predict whether someone is placed would leak the target.

# 2. Feature Engineering

In [None]:
# Spearman correlation between the numeric columns and the 0/1-encoded target.
# Only `status` is encoded (via an explicit mapping) and the remaining string
# columns are filtered out before calling `corr`: pandas >= 2.0 no longer drops
# non-numeric columns silently and raises on them instead.
# NOTE: this cell expects `status` to still hold the raw 'Placed' / 'Not Placed'
# labels, i.e. it must run before the encoding cell further down.
status_codes = df['status'].map({'Placed': 1, 'Not Placed': 0})
(
    df.assign(status=status_codes)
    .select_dtypes(include='number')
    .corr(method='spearman')
)

Using Spearman's rank correlation coefficient to find correlations between columns.

## Using the chi-squared test of independence

* Significance level: alpha = 0.05
* Null hypothesis: the two features are not correlated (they are independent).
* Alternative hypothesis: the two features are correlated.

In [None]:
def chi2_p_value(frame, target, feature):
    """Return the p-value of a chi-squared test of independence.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing both columns.
    target, feature : str
        Names of the two categorical columns to cross-tabulate.

    Returns
    -------
    float
        The p-value reported by ``scipy.stats.chi2_contingency`` for the
        contingency table of ``target`` against ``feature``.
    """
    contingency = pd.crosstab(frame[target], frame[feature])
    # Index 1 of the chi2_contingency result is the p-value.
    return stats.chi2_contingency(contingency)[1]


ALPHA = 0.05
categorical_features = ["degree_t", "hsc_s", "ssc_b", "hsc_b", "gender", "specialisation", "workex"]

# Compute every p-value once, then group the features by the actual test outcome
# instead of hard-coding which feature belongs under which heading.
p_values = {feature: chi2_p_value(df, "status", feature) for feature in categorical_features}
not_significant = [feature for feature in categorical_features if p_values[feature] >= ALPHA]
significant = [feature for feature in categorical_features if p_values[feature] < ALPHA]

print("P>{} (ACCEPT NULL)".format(ALPHA))
print("_"*50)
for feature in not_significant:
    print("P value for status and {}: {:.3f}".format(feature, p_values[feature]))
print("\n")
print("P<{} (REJECT NULL)".format(ALPHA))
print("_"*50)
for feature in significant:
    print("P value for status and {}: {:.3f}".format(feature, p_values[feature]))


Dropping the features that are not significantly correlated with **status**.

In [None]:
# Features dropped from the model inputs (the chi-squared cell above reports
# them under the P>0.05 heading).
uncorrelated_cols = ["degree_t", "hsc_s", "hsc_b", "ssc_b", "gender"]

# Explicit per-column encodings for the remaining categorical columns.
binary_encodings = {
    "workex": {"No": 0, "Yes": 1},
    "specialisation": {"Mkt&HR": 0, "Mkt&Fin": 1},
    "status": {"Not Placed": 0, "Placed": 1},
}

df = df.drop(columns=uncorrelated_cols)
for column, mapping in binary_encodings.items():
    # `.map` + an explicit integer cast guarantees a numeric dtype; a frame-wide
    # `replace` relies on pandas silently downcasting object columns, which is
    # deprecated. An unexpected label becomes NaN and makes the cast fail loudly
    # here rather than later inside the model.
    df[column] = df[column].map(mapping).astype("int64")

df.head()

# 3. MODEL TRAINING:

In [None]:
from sklearn.model_selection import train_test_split

# Every column except the last one is a model input; the last column is the target.
features = df.iloc[:, :-1]
target = df.iloc[:, -1]

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)

Splitting the dataframe into training (75%) and test (25%) sets.

### Using RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fit a small random forest (12 trees, depth capped at 8) on the training split.
clf = RandomForestClassifier(
    n_estimators=12,
    max_depth=8,
    random_state=0,
).fit(X_train, y_train)

# Score the held-out rows and report accuracy as a percentage.
y_pred = clf.predict(X_test)
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print(" accuracy score: {:.2f}%".format(accuracy_pct))

### Accuracy: 85.19%

# Thank you!!!!