# Logistic Regression

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Load the training data. Every column except the last becomes the feature
# matrix X; the last column becomes the target vector y.
dataset = pd.read_csv('train.csv')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values


## Taking care of missing data

In [None]:
from sklearn.impute import SimpleImputer

# Columns 6 onward: replace NaNs with the column mean.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 6:] = imputer.fit_transform(X[:, 6:])

# Columns 1 through 5: replace NaNs with the most frequent value of the column.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X[:, 1:6] = imputer.fit_transform(X[:, 1:6])

In [None]:
# Display the feature matrix after imputation.
print(X)

[[65438 'Sales & Marketing' 'region_7' ... 1.0 0.0 49.0]
 [65141 'Operations' 'region_22' ... 0.0 0.0 60.0]
 [7513 'Sales & Marketing' 'region_19' ... 0.0 0.0 50.0]
 ...
 [15532 'Procurement' 'region_10' ... 1.0 0.0 71.0]
 [39246 'Technology' 'region_33' ... 0.0 0.0 77.0]
 [42690 'Sales & Marketing' 'region_16' ... 1.0 0.0 96.0]]


### Encoding the Independent Variables

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 1 (values such as 'Sales & Marketing' in the printout
# above). The encoded columns are placed first and every other column is
# passed through unchanged after them.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [None]:
# Display the feature matrix after the first one-hot encoding.
print(X)

[[0.0 0.0 0.0 ... 1.0 0.0 49.0]
 [0.0 0.0 0.0 ... 0.0 0.0 60.0]
 [0.0 0.0 0.0 ... 0.0 0.0 50.0]
 ...
 [0.0 0.0 0.0 ... 1.0 0.0 71.0]
 [0.0 0.0 0.0 ... 0.0 0.0 77.0]
 [0.0 0.0 0.0 ... 1.0 0.0 96.0]]


In [None]:
# Print the first 15 entries of the first row, one per line, to see where the
# new one-hot columns end and which index each remaining column now sits at.
for x in range(15):
    print(X[0,x])

0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
65438
region_7
Master's & above
f
sourcing
1.0


In [None]:
# One-hot encode column 10, which in the row printed above holds the region
# label (e.g. 'region_7'). Encoded columns go first; the rest pass through.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [10])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [None]:
# Print the first 50 entries of the first row to locate the next
# still-categorical column after the previous encoding shifted the indices.
for x in range(50):
    print(X[0,x])

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
65438
Master's & above
f
sourcing
1.0
35.0
5.0


In [None]:
# One-hot encode column 44, which in the row printed above holds the
# education label (e.g. "Master's & above"). Encoded columns go first; the
# rest pass through.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [44])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))


In [None]:
# Print the first 55 entries of the first row to locate the next
# still-categorical column after the previous encoding shifted the indices.
for x in range(55):
    print(X[0,x])

0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
65438
f
sourcing
1.0
35.0
5.0
8.0
1.0
0.0


In [None]:
# One-hot encode column 47, which in the row printed above holds a
# single-letter label ('f'). Encoded columns go first; the rest pass through.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [47])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [None]:
# Print the first 50 entries of the first row to locate the last
# still-categorical column after the previous encoding shifted the indices.
for x in range(50):
    print(X[0,x])

1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
65438
sourcing


In [None]:
# One-hot encode column 49, which in the row printed above holds the text
# label 'sourcing'. Encoded columns go first; the rest pass through.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [49])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [None]:
# Print the first 55 entries of the first row to check that no text-valued
# columns remain among them after the final encoding.
for x in range(55):
    print(X[0,x])

0.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
65438
1.0
35.0
5.0


### Encoding the Dependent Variable

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels as integers. The result is written back to `y`
# because the train/test split and everything after it read `y`; the original
# stored it in `Y`, so the encoded labels were never used downstream.
le = LabelEncoder()
y = le.fit_transform(y)
Y = y  # alias kept so any code that referenced the old `Y` name still works
# NOTE(review): this prints the feature matrix, not the encoded target —
# confirm whether print(y) was intended.
print(X)

[[0.0 0.0 1.0 ... 1.0 0.0 49.0]
 [1.0 0.0 0.0 ... 0.0 0.0 60.0]
 [0.0 0.0 1.0 ... 0.0 0.0 50.0]
 ...
 [1.0 0.0 0.0 ... 1.0 0.0 71.0]
 [1.0 0.0 0.0 ... 0.0 0.0 77.0]
 [0.0 0.0 1.0 ... 1.0 0.0 96.0]]


## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Display the training features produced by the split.
print(X_train)

[[0.0 0.0 1.0 ... 0.0 0.0 61.0]
 [0.0 0.0 1.0 ... 0.0 0.0 69.0]
 [0.0 0.0 1.0 ... 1.0 0.0 89.0]
 ...
 [0.0 0.0 1.0 ... 1.0 0.0 49.0]
 [0.0 0.0 1.0 ... 1.0 0.0 79.0]
 [0.0 0.0 1.0 ... 0.0 0.0 81.0]]


In [None]:
# Display the training targets produced by the split.
print(y_train)

[0 0 1 ... 0 0 0]


In [None]:
# Display the held-out test features produced by the split.
print(X_test)

[[1.0 0.0 0.0 ... 0.0 0.0 82.0]
 [0.0 0.0 1.0 ... 1.0 0.0 49.0]
 [1.0 0.0 0.0 ... 0.0 0.0 49.0]
 ...
 [0.0 0.0 1.0 ... 0.0 0.0 58.0]
 [1.0 0.0 0.0 ... 0.0 0.0 48.0]
 [0.0 0.0 1.0 ... 0.0 0.0 79.0]]


In [None]:
# Display the held-out test targets produced by the split.
print(y_test)

[0 0 0 ... 0 0 0]


## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn per-column mean and standard deviation from the training set only,
# then apply that same scaling to both the training and the test features.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Display the standardized training features.
print(X_train)

[[-1.11615118 -0.14367235  1.16303472 ... -0.73926871 -0.15323435
  -0.17926847]
 [-1.11615118 -0.14367235  1.16303472 ... -0.73926871 -0.15323435
   0.41865178]
 [-1.11615118 -0.14367235  1.16303472 ...  1.35268811 -0.15323435
   1.91345243]
 ...
 [-1.11615118 -0.14367235  1.16303472 ...  1.35268811 -0.15323435
  -1.07614886]
 [-1.11615118 -0.14367235  1.16303472 ...  1.35268811 -0.15323435
   1.16605211]
 [-1.11615118 -0.14367235  1.16303472 ... -0.73926871 -0.15323435
   1.31553217]]


In [None]:
# Display the test features after applying the scaler fitted on the training set.
print(X_test)

[[ 0.89593597 -0.14367235 -0.85981956 ... -0.73926871 -0.15323435
   1.3902722 ]
 [-1.11615118 -0.14367235  1.16303472 ...  1.35268811 -0.15323435
  -1.07614886]
 [ 0.89593597 -0.14367235 -0.85981956 ... -0.73926871 -0.15323435
  -1.07614886]
 ...
 [-1.11615118 -0.14367235  1.16303472 ... -0.73926871 -0.15323435
  -0.40348857]
 [ 0.89593597 -0.14367235 -0.85981956 ... -0.73926871 -0.15323435
  -1.15088889]
 [-1.11615118 -0.14367235  1.16303472 ... -0.73926871 -0.15323435
   1.16605211]]


## Training the Logistic Regression model on the Training set

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression classifier on the scaled training data. The
# fit call is left as the trailing expression so the notebook still displays
# the fitted estimator.
classifier = LogisticRegression(random_state=0)
classifier.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Predicting a new result

## Predicting the Test set results

In [None]:
# Predict labels for the test set and print them side by side with the true
# labels: first column is the prediction, second column is the ground truth.
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


## Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test rows classified correctly; left as the trailing expression
# so the notebook displays the value.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[12412    69]
 [  832   301]]


0.9338181283972381