In [None]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the notebook.
warnings.simplefilter("ignore")

# Print the full path of each file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Show every column when a DataFrame is rendered (no truncation).
pd.options.display.max_columns = None

# Load the campus placement dataset and preview the first rows.
df = pd.read_csv("/kaggle/input/factors-affecting-campus-placement/Placement_Data_Full_Class.csv")
df.head()

**Now to understand what each column means**
1. sl_no -> Serial number for every row 
2. gender - > gender of the person , M for male , F for female
3. ssc_p ->  10th Grade percentage 
4. ssc_b ->  10th Grade Board of Education - Central or Others 
5. hsc_p -> 12th Grade percentage
6. hsc_b -> 12th Grade Board of Education- Central or Others
7. hsc_s -> Specialization in Higher Secondary Education 
8. degree_p -> Degree Percentage
9. degree_t -> Degree type 
10. workex -> Work Experience
11. etest_p -> Employability test percentage
12. specialisation -> Post Graduation(MBA)- Specialization
13. mba_p -> MBA percentage
14. status -> Status of placement
15. salary -> Salary offered if placed 

Now that we have a clear picture of what each column means, we can decide what our end goal should be.

For the above given data we can have a total of 3 tasks 
1. Given the data, predict whether a candidate will get placed 
2. Predict what kind of placement package will be offered to the candidate -> can be used for negotiation 
3. Predict whether the candidate will get placed and, if yes, then predict their salary 


> Performing EDA to better understand the data

In [None]:
# Summarise the columns: dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the data.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

We can see that salary has null values. This is because those candidates didn't get placed, so naturally their salary is empty and can be treated as 0.


In [None]:
# Candidates without a salary entry get a salary of 0; only the salary column is filled.
df = df.fillna({"salary": 0})

> ***Visualizing the given data***

In [None]:
# Count of candidates per gender, coloured by gender.
sns.countplot(data=df, x="gender", hue="gender")

In [None]:
# Pairwise scatter plots / histograms of the columns seaborn can plot.
sns.pairplot(data=df)

In [None]:
categorical = ["gender","ssc_b","hsc_b","hsc_s","degree_t","workex","specialisation","status"]

# One count plot per categorical column on a 3x3 grid. There are 8 columns,
# so the last grid cell stays empty and is hidden below.
fig, axes = plt.subplots(3, 3, figsize=(25, 20))
grid_axes = axes.flatten()

for ax, col in zip(grid_axes, categorical):
    # Pass the column with the x= keyword: recent seaborn releases read the
    # first positional argument as `data`, not as the variable to count.
    sns.countplot(x=df[col], ax=ax)
    ax.set_xlabel(col, fontsize=15)

# Hide any grid cells that did not receive a plot.
for ax in grid_axes[len(categorical):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

Label Encoding the categorical columns 

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit a separate encoder for every categorical column and keep it, so the
# integer codes can be mapped back to the original labels later with
# encoders[col].inverse_transform(...). Reusing a single encoder would keep
# only the mapping of the last column it was fitted on.
encoders = {}
for i in categorical:
    encoders[i] = LabelEncoder()
    df[i] = encoders[i].fit_transform(df[i])

# `le` stays bound to the encoder of the last column, as before.
le = encoders[categorical[-1]]

Now we will check for any outliers in the dataset 

In [None]:
# Box plots to look for outliers. Only the continuous columns are drawn:
# box plots of the serial number or of label-encoded categories say nothing
# about outliers, and iterating over the first 8 columns of df would miss
# etest_p, mba_p and salary.
numeric_cols = ["ssc_p", "hsc_p", "degree_p", "etest_p", "mba_p", "salary"]

fig, axes = plt.subplots(2, 3, figsize=(20, 15))
for ax, col in zip(axes.flatten(), numeric_cols):
    # x= keyword: recent seaborn releases read the first positional
    # argument as `data`, not as the variable to plot.
    sns.boxplot(x=df[col], ax=ax)
    ax.set_xlabel(col, fontsize=15)

plt.tight_layout()
plt.show()

In [None]:
import plotly.graph_objects as go

# Correlation matrix of the columns, drawn as an interactive heatmap.
corr = df.corr()
heatmap = go.Heatmap(z=corr.values, x=corr.index.values, y=corr.columns.values)
graph = go.Figure(data=[heatmap])
graph.show()

As we can see, there are a couple of outliers in the dataset. We can deal with them in the following ways: 
1. Choose a model which is not affected by outliers 
2. Remove or transform the data -> we can't remove the data since we only have 215 rows

However, we have another issue with the given dataset: an imbalanced data distribution
1. To deal with this we will be using the SMOTE and SMOGN packages 
2. SMOTE is used for classification
3. SMOGN is used for regression 

But first we will drop the unnecessary columns present in the dataset and create two separate datasets, one for regression and the other for classification

In [None]:
import copy

# Drop the serial-number column. errors="ignore" keeps the cell re-runnable:
# without it a second run raises KeyError because sl_no is already gone.
df = df.drop(columns="sl_no", errors="ignore")

# Independent copies: one frame for the regression task, one for classification.
df_reg = copy.deepcopy(df)
df_class = copy.deepcopy(df)

In [None]:
# Drop salary from the classification frame.
# errors="ignore" lets the cell be re-run without raising KeyError.
df_class = df_class.drop(columns="salary", errors="ignore")

In [None]:
# Display the regression frame (copy of df, salary column kept).
df_reg

In [None]:
# Display the classification frame (copy of df with the salary column dropped).
df_class

In notebooks it is not necessary to use the head function to view the data
1. The reason I split the dataframes is so that we can deal with the imbalance problem easily

In [None]:
!pip install smogn

In [None]:
import smogn

# Resample the regression frame with SMOGN; 'salary' is the column to be predicted.
df_reg_smogn = smogn.smoter(data=df_reg, y='salary')

In [None]:
# Compare the salary distribution before and after SMOGN resampling.
sns.kdeplot(x=df_reg['salary'], label="Original")
sns.kdeplot(x=df_reg_smogn['salary'], label="Modified")
# The labels are only shown once a legend is drawn.
plt.legend()

1. Now that we have dealt with the imbalance issue we can move on to splitting the data and scaling it
2. Since our dataset doesn't have many rows, I have avoided dropping the rows with outliers in them
3. The small dataset is also the reason I have reduced the test_size

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the last; the last column (salary) is the target.
X = df_reg_smogn.iloc[:, :-1].values
y = df_reg_smogn.iloc[:, -1].values

# random_state is fixed so the split (and every score derived from it) is
# reproducible, matching the classification splits in this notebook.
# NOTE(review): the resampling was applied before this split, so the test
# rows may include synthetic samples - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)


Now that we have dealt with the imbalance problem for the regression dataset and also split it into training and testing sets, we will move on to selecting a model 

There are a lot of models to select from, but remember that our data has outliers present in it 
1. This means we can't select models that are sensitive to outliers -> tree-based models, the XGB model and a few others are viable
2. Some models don't require scaling, so it's better to avoid it whenever it's not necessary
3. When unsure which model to select, simply call lazypredict
4. Lazypredict trains models which require scaling, so we will scale the data before calling it 

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply
# the same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
!pip install lazypredict

1. Lazypredict trains all the default sklearn models on the data and returns the metrics report shown below. It is useful when you have no idea which model to use.
2. However, it only trains the base models; we can improve the accuracy of the listed models with hyperparameter optimization


In [None]:
from lazypredict.Supervised import LazyRegressor

# Fit lazypredict's regressors on the scaled data and show the comparison table.
reg = LazyRegressor(custom_metric=None, ignore_warnings=True)
lazy_results = reg.fit(X_train, X_test, y_train, y_test)
models, predictions = lazy_results
models

As we can see, the XGBRegressor and the RandomForest model are the ones with the highest accuracy. This is only to be expected since both models aren't affected by outliers 

In [None]:
from xgboost import XGBRegressor

xgbr = XGBRegressor(verbosity=0)
xgbr.fit(X_train, y_train)

# Adjusted R-squared on the test split:
#   1 - (1 - R^2) * (n - 1) / (n - p - 1)
# with n test rows and p features.
r2 = xgbr.score(X_test, y_test)
n_obs = len(y_test)
n_feat = X_test.shape[1]
1 - (1 - r2) * (n_obs - 1) / (n_obs - n_feat - 1)

1. I will not be optimising any hyperparameters simply because Adjusted R-Squared value of .94 is pretty good
2. However the data available is extremely limited so , the actual performance of the model is questionable

Now to move onto classification

In [None]:
# Last column of df_class is the target; everything before it is a feature.
feature_cols = df_class.columns[:-1]
target_col = df_class.columns[-1]
X = df_class[feature_cols].values
y = df_class[target_col].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=0
)
print(X_train.shape, X_test.shape)

Scaling the data

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split, then transform both splits with it.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

Calling the lazypredict again

In [None]:
from lazypredict.Supervised import LazyClassifier

# Fit lazypredict's classifiers on the scaled data and show the comparison table.
clf = LazyClassifier(predictions=True)
lazy_results = clf.fit(X_train, X_test, y_train, y_test)
models, predictions = lazy_results
models

Now lets try doing that again but this time using smote

In [None]:
# Rebuild the unscaled split from df_class (X_train / X_test were overwritten
# by their scaled versions above). Same random_state, so the same rows land
# in each split.
X, y = df_class.iloc[:, :-1].values, df_class.iloc[:, -1].values
split = train_test_split(X, y, test_size=0.15, random_state=0)
X_train, X_test, y_train, y_test = split
print(X_train.shape, X_test.shape)


In [None]:
from imblearn.over_sampling import SMOTE

# Resample the training split with SMOTE; the test split is left untouched.
sm = SMOTE(random_state=0)
resampled = sm.fit_resample(X_train, y_train.ravel())
X_train, y_train = resampled
print(X_train.shape, X_test.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Scaling statistics come from the (resampled) training split only.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from lazypredict.Supervised import LazyClassifier

# Same model comparison as before, this time on the SMOTE-resampled training data.
clf = LazyClassifier(predictions=True)
lazy_results = clf.fit(X_train, X_test, y_train, y_test)
models, predictions = lazy_results
models

We can see a small improvement in the balanced accuracy 

Now to train the model present and optimize it

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report

# Baseline Bernoulli naive Bayes with default hyperparameters.
bc = BernoulliNB().fit(X_train, y_train)
y_pred = bc.predict(X_test)
print(classification_report(y_test, y_pred))

Now to improve the performance of the model with hyperparameter optimisation

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score

# Hyperparameter grid for BernoulliNB.
# alpha starts at 0.1 instead of 0.0: alpha=0 means no smoothing at all,
# which scikit-learn does not handle cleanly (depending on the version it is
# clipped with a warning or used as-is).
li = np.arange(0.1, 10.0, 0.1)
li_ = list(range(10))

params = {
    'alpha': li,
    'binarize': li_,
    'fit_prior': [True, False],
}

# 5-fold cross-validated search, optimising balanced accuracy.
# verbose=1 keeps the progress output short; the grid produces thousands of fits.
bernoulli_nb_grid = GridSearchCV(
    BernoulliNB(),
    param_grid=params,
    n_jobs=-1,
    cv=5,
    verbose=1,
    scoring='balanced_accuracy',
)
bernoulli_nb_grid.fit(X_train, y_train)

best_nb = bernoulli_nb_grid.best_estimator_

# .score() is plain accuracy; the balanced accuracy is reported as well so the
# test figure is comparable with the metric the search optimised.
print('Test Accuracy : %.3f'%best_nb.score(X_test, y_test))
print('Test Balanced Accuracy : %.3f'%balanced_accuracy_score(y_test, best_nb.predict(X_test)))

# best_score_ is the mean cross-validated balanced accuracy of the best parameters.
print('Best CV Balanced Accuracy Through Grid Search : %.3f'%bernoulli_nb_grid.best_score_)
print('Best Parameters : ',bernoulli_nb_grid.best_params_)

I will try to keep updating the notebook as time goes on. Thanks for your time 