# Training dataset with different models

In [2]:
import pandas as pd
import numpy as np
import sidetable
import joblib

In [3]:
d1 = pd.read_csv("Data.csv")

In [4]:
d1

Unnamed: 0,gender,tenth_marks,pu_marks,pu_stream,deg_marks,deg_stream,workex,status
0,M,58.0,70.00,Commerce,61.00,Comm&Mgmt,No,Not Placed
1,M,58.0,61.00,Commerce,60.00,Comm&Mgmt,Yes,Placed
2,M,69.6,68.40,Commerce,78.30,Comm&Mgmt,Yes,Placed
3,F,47.0,55.00,Science,65.00,Comm&Mgmt,No,Not Placed
4,F,77.0,87.00,Commerce,59.00,Comm&Mgmt,No,Placed
...,...,...,...,...,...,...,...,...
327,M,81.1,78.60,Arts,35.84,Sci&Tech,No,Not Placed
328,M,81.3,85.12,Arts,68.50,Sci&Tech,No,Not Placed
329,M,83.5,89.90,Science,76.30,Sci&Tech,No,Not Placed
330,M,85.5,90.63,Science,74.50,Sci&Tech,No,Not Placed


In [5]:
d1.select_dtypes('object')

Unnamed: 0,gender,pu_stream,deg_stream,workex,status
0,M,Commerce,Comm&Mgmt,No,Not Placed
1,M,Commerce,Comm&Mgmt,Yes,Placed
2,M,Commerce,Comm&Mgmt,Yes,Placed
3,F,Science,Comm&Mgmt,No,Not Placed
4,F,Commerce,Comm&Mgmt,No,Placed
...,...,...,...,...,...
327,M,Arts,Sci&Tech,No,Not Placed
328,M,Arts,Sci&Tech,No,Not Placed
329,M,Science,Sci&Tech,No,Not Placed
330,M,Science,Sci&Tech,No,Not Placed


In [6]:
d1['gender'].unique()

array(['M', 'F'], dtype=object)

In [7]:
d1['pu_stream'].unique()

array(['Commerce', 'Science', 'Arts'], dtype=object)

In [8]:
d1['deg_stream'].unique()

array(['Comm&Mgmt', 'Sci&Tech', 'Others'], dtype=object)

In [9]:
d1['workex'].unique()

array(['No', 'Yes'], dtype=object)

In [10]:
d1['status'].unique()

array(['Not Placed', 'Placed'], dtype=object)

In [11]:
d1.stb.freq(['gender','pu_stream'])

Unnamed: 0,gender,pu_stream,count,percent,cumulative_count,cumulative_percent
0,M,Science,108,32.53012,108,32.53012
1,M,Commerce,90,27.108434,198,59.638554
2,F,Science,69,20.783133,267,80.421687
3,F,Commerce,50,15.060241,317,95.481928
4,M,Arts,9,2.710843,326,98.192771
5,F,Arts,6,1.807229,332,100.0


In [12]:
obj_columns = [x for x in d1.select_dtypes('object').columns]
obj_columns

['gender', 'pu_stream', 'deg_stream', 'workex', 'status']

In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [14]:
for col in obj_columns:
    d1[col] = le.fit_transform(d1[col])
    
d1

Unnamed: 0,gender,tenth_marks,pu_marks,pu_stream,deg_marks,deg_stream,workex,status
0,1,58.0,70.00,1,61.00,0,0,0
1,1,58.0,61.00,1,60.00,0,1,1
2,1,69.6,68.40,1,78.30,0,1,1
3,0,47.0,55.00,2,65.00,0,0,0
4,0,77.0,87.00,1,59.00,0,0,1
...,...,...,...,...,...,...,...,...
327,1,81.1,78.60,0,35.84,2,0,0
328,1,81.3,85.12,0,68.50,2,0,0
329,1,83.5,89.90,2,76.30,2,0,0
330,1,85.5,90.63,2,74.50,2,0,0


# 1. Logistic Regression (Linear)

In [15]:
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(max_iter=100)
from sklearn.model_selection import train_test_split
X = d1.drop('status', axis='columns')
y = d1['status']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state=10)

In [17]:
lr_model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
y_pred = lr_model.predict(X_train)
from sklearn.metrics import accuracy_score
accuracy_score(y_train, y_pred)

0.8339622641509434

In [19]:
y_pred = lr_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7910447761194029

In [20]:
joblib.dump(lr_model,"model.joblib")

['model.joblib']

In [21]:
print(X_test)

     gender  tenth_marks  pu_marks  pu_stream  deg_marks  deg_stream  workex
148       0        77.00     86.00          0      56.00           1       0
147       1        70.00     74.00          1      65.00           0       0
154       1        53.00     63.00          2      60.00           0       1
324       1        81.00     73.00          2      42.17           2       0
105       1        59.00     64.00          2      58.00           2       0
..      ...          ...       ...        ...        ...         ...     ...
25        0        52.58     54.60          1      50.20           0       1
211       1        58.00     60.00          2      72.00           2       0
301       0        50.66     54.20          1      60.53           2       0
59        1        52.60     65.58          2      72.11           2       0
37        0        79.00     76.00          2      65.60           2       0

[67 rows x 7 columns]
