### Importing the Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

### Importing the Dataset

In [None]:
# Read the placement CSV from the relative input directory into a DataFrame.
dataset = pd.read_csv('../input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv')

In [None]:
# Preview the first rows of the loaded data as the cell output.
dataset.head()

In [None]:
# Cast the non-numerical feature columns to the generic 'object' dtype so that
# they are selected as non-numeric columns below.
object_columns = [
    'gender',
    'ssc_b',
    'hsc_b',
    'hsc_s',
    'degree_t',
    'workex',
    'specialisation',
]
for column_name in object_columns:
    dataset[column_name] = dataset[column_name].astype('object')

# Every non-numeric column except the target ('status') is treated as categorical.
non_numeric = dataset.select_dtypes(exclude='number')
categorical_columns = non_numeric.drop(columns='status').columns

print(categorical_columns)

In [None]:
# Start with the numerical columns only (picked by position) for feature selection.
# NOTE(review): position 14 is the column imputed later as "salary"; if salary is
# only recorded for placed students it leaks the target — confirm before relying on it.
numeric_positions = [2, 4, 7, 10, 12, 14]
target_position = 13
X = dataset.iloc[:, numeric_positions].values
Y = dataset.iloc[:, target_position].values

In [None]:
# Inspect the numeric feature matrix.
print(X)

In [None]:
# Inspect the target values taken from column position 13.
print(Y)

In [None]:
# Number of rows in the dataset.
len(dataset)

#### Checking for null values

In [None]:
# Count the null entries in each column.
dataset.isnull().sum()

In [None]:
# So salary column contains null values

### Taking care of missing data in Salary column

In [None]:
from sklearn.impute import SimpleImputer

# Replace the NaN entries of the last feature column (index 5) with that column's mean.
salary_column = [5]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, salary_column] = imputer.fit_transform(X[:, salary_column])

In [None]:
# Feature matrix after filling the missing values in column index 5.
print(X)

### Feature Selection of Numerical Values

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

plt.rcParams['figure.figsize'] = 15, 6
sns.set_style("darkgrid")

# Same column positions as the numeric X, kept as a DataFrame so the column
# names can label the bars.
x = dataset.iloc[:, [2, 4, 7, 10, 12, 14]]

# A fixed seed makes the randomized tree ensemble, and therefore the importance
# ranking read off the plot, reproducible between runs.
model = ExtraTreesClassifier(random_state=0)
model.fit(X, Y)
print(model.feature_importances_)

feat_importances = pd.Series(model.feature_importances_, index=x.columns)
# Plot every feature, ordered by importance.
feat_importances.nlargest(len(feat_importances)).plot(kind='barh')
plt.show()

In [None]:
# So we can conclude that 'Salary' and 'ssc_p' are two relevant features for predicting the placement status of a student

### Feature Selection of Categorical Data

In [None]:
# Import the function
#from scipy.stats import chi2_contingency
#Testing the relationship
#chi_res = chi2_contingency(pd.crosstab(dataset['status'], dataset['gender']))
#print('Chi2 Statistic: {}, p-value: {}'.format(chi_res[0], chi_res[1]))

In [None]:
from scipy.stats import chi2_contingency

SIGNIFICANCE_LEVEL = 0.05


def _chi2_verdict(column_name):
    """Return the test verdict for 'status' versus one categorical column."""
    contingency = pd.crosstab(dataset['status'], dataset[column_name])
    p_value = chi2_contingency(contingency)[1]
    if p_value < SIGNIFICANCE_LEVEL:
        return 'Reject Null Hypothesis'
    return 'Fail to Reject Null Hypothesis'


# One verdict per categorical column, in the same order as categorical_columns.
chi2_check = [_chi2_verdict(column_name) for column_name in categorical_columns]

# Two rows (names, verdicts) transposed into a two-column table.
res = pd.DataFrame(data=[categorical_columns, chi2_check]).T
res.columns = ['Column', 'Hypothesis']
print(res)

In [None]:
# With a significance level of 0.05: if the test's p-value is greater than 0.05, we fail to reject the Null Hypothesis.
# This means the Chi-Square test of independence found no statistically significant evidence of a relationship between the Feature and the Dependent Variable.
# If the test's p-value is less than 0.05, we reject the Null Hypothesis.
# This means the Chi-Square test of independence indicates a statistically significant relationship between the Feature and the Dependent Variable.

In [None]:
# So we conclude that 'workex' and 'specialisation' are two important features for predicting status.

In [None]:
# Final feature matrix after numerical and categorical feature selection
# (columns picked by position).
# NOTE(review): position 14 is the salary column; if salary is only recorded for
# placed students it leaks the target — confirm before trusting the accuracies.
selected_positions = [2, 9, 11, 14]
X = dataset.iloc[:, selected_positions].values

# Mean-impute the NaN entries of the last selected column (index 3).
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, [3]] = imputer.fit_transform(X[:, [3]])

### Encoding Categorical Values

#### Finding the categories

In [None]:
# Show the distinct values of the two selected categorical features and the target.
for column_name in ('workex', 'specialisation', 'status'):
    print(dataset[column_name].unique())

### Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# A separate encoder is fitted for feature columns 1 and 2 and for the target.
le1, le2, le3 = (LabelEncoder() for _ in range(3))
for column_index, encoder in ((1, le1), (2, le2)):
    X[:, column_index] = encoder.fit_transform(X[:, column_index])
Y = le3.fit_transform(Y)

In [None]:
# First row of the feature matrix after label encoding.
print(X[0])

In [None]:
# Target values after label encoding.
print(Y)

### Splitting the dataset into training set and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize feature columns 0 and 3 only. The scaler is fitted on the training
# rows and then reused, unchanged, for the test rows.
scaled_columns = [0, 3]
sc = StandardScaler()
sc.fit(x_train[:, scaled_columns])
x_train[:, scaled_columns] = sc.transform(x_train[:, scaled_columns])
x_test[:, scaled_columns] = sc.transform(x_test[:, scaled_columns])

### Applying classification models on the Training set

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split.
classifier = LogisticRegression(random_state=0)
classifier.fit(X=x_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# mylist collects the accuracy of each model, mylist2 the matching model name.
mylist, mylist2 = [], []

# Score the fitted classifier on the test split.
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
print(cm)
print(ac)

mylist.append(ac)
mylist2.append('Logistic Regression')

### K Nearest Neighbor

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbour classifier; Minkowski distance with p=2 is the
# Euclidean distance.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    p=2,
    metric='minkowski',
)
classifier.fit(X=x_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the fitted classifier on the test split.
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
print(cm)
print(ac)

# Record the accuracy and the model name for the comparison chart.
mylist.append(ac)
mylist2.append('KNN')

### Naive Bayes Classification

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier on the training split.
classifier = GaussianNB()
classifier.fit(X=x_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the fitted classifier on the test split.
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
print(cm)
print(ac)

# Record the accuracy and the model name for the comparison chart.
mylist.append(ac)
mylist2.append('Naive Bayes')

### Decision Tree Classification

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit an entropy-criterion decision tree on the training split (seeded).
classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
classifier.fit(X=x_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the fitted classifier on the test split.
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
print(cm)
print(ac)

# Record the accuracy and the model name for the comparison chart.
mylist.append(ac)
mylist2.append('Decision Tree')

### Support Vector Classification

In [None]:
from sklearn.svm import SVC

# Fit an RBF-kernel support vector classifier on the training split (seeded).
classifier = SVC(
    kernel='rbf',
    random_state=0,
)
classifier.fit(X=x_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the fitted classifier on the test split.
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
print(cm)
print(ac)

# Record the accuracy and the model name for the comparison chart.
mylist.append(ac)
mylist2.append('Support Vector')

### Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 10-tree, entropy-criterion random forest on the training split (seeded).
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X=x_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the fitted classifier on the test split.
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
print(cm)
print(ac)

# Record the accuracy and the model name for the comparison chart.
mylist.append(ac)
mylist2.append('Random Forest')

In [None]:
# Bar chart comparing the accuracy recorded for each classifier.
plt.rcParams['figure.figsize'] = (10, 6)
sns.set_style("darkgrid")
# NOTE(review): seaborn documents saturation as a proportion of the original
# saturation; confirm that 1.5 is intended.
ax = sns.barplot(x=mylist2, y=mylist, palette="rocket", saturation=1.5)

plt.title("Accuracy of different Classifier Models", fontsize=20)
plt.xlabel("Classifier Models", fontsize=20)
plt.ylabel("Accuracy", fontsize=20)
plt.xticks(fontsize=11, horizontalalignment='center', rotation=8)
plt.yticks(fontsize=13)

# Write each bar's height, formatted as a percentage, just above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_y() + bar_height * 1.02
    ax.annotate(f'{bar_height:.2%}', (label_x, label_y), ha='center', fontsize='x-large')
plt.show()