In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## loading dataset using pandas

In [5]:
data=pd.read_csv("HR_comma_sep.csv")

In [6]:
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [7]:
data.shape

(14999, 10)

##### check the dataset if it has any null values

In [8]:
data.isnull().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
sales                    0
salary                   0
dtype: int64

###### using info()  method to get the info about the data  ie number of values their datatype and whether it has nul values or not.

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
sales                    14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [10]:
label_encoder=LabelEncoder()

###### converted the categorical data to numerical data using labelencoder

In [11]:
data.sales.unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [12]:
data.sales=label_encoder.fit_transform(data.sales)

In [13]:
data.salary.unique()

array(['low', 'medium', 'high'], dtype=object)

In [14]:
data.salary=label_encoder.fit_transform(data.salary)

In [15]:
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,7,1
1,0.8,0.86,5,262,6,0,1,0,7,2
2,0.11,0.88,7,272,4,0,1,0,7,2
3,0.72,0.87,5,223,5,0,1,0,7,1
4,0.37,0.52,2,159,3,0,1,0,7,1


In [16]:
y=data['left']
y.unique()

array([1, 0])

In [17]:
X=data.drop(['left'],axis=1)

##### splitting the data into train and test data  for learning purpose and testing the model.

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [19]:
model=LogisticRegression(solver='lbfgs',max_iter=1000)

In [20]:
model.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [21]:
model.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

In [22]:
model.score(X_test,y_test)

0.7664646464646465

In [23]:
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,7,1
1,0.8,0.86,5,262,6,0,1,0,7,2
2,0.11,0.88,7,272,4,0,1,0,7,2
3,0.72,0.87,5,223,5,0,1,0,7,1
4,0.37,0.52,2,159,3,0,1,0,7,1


as the values in the column satsfaction_level and last_evaluation are not in the order of the other entries we can multiply the values in the column by a constant to make it in the order of the column values .

as the columns are a kind of rating multiplying it by 10 won't change the meaning .
previously the ratings were out of 1,now it is out of 10.meaning remains the same.

In [24]:
data.satisfaction_level=data.satisfaction_level*10

In [25]:
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,3.8,0.53,2,157,3,0,1,0,7,1
1,8.0,0.86,5,262,6,0,1,0,7,2
2,1.1,0.88,7,272,4,0,1,0,7,2
3,7.2,0.87,5,223,5,0,1,0,7,1
4,3.7,0.52,2,159,3,0,1,0,7,1


In [26]:
data.last_evaluation=data.last_evaluation*10

In [27]:
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,3.8,5.3,2,157,3,0,1,0,7,1
1,8.0,8.6,5,262,6,0,1,0,7,2
2,1.1,8.8,7,272,4,0,1,0,7,2
3,7.2,8.7,5,223,5,0,1,0,7,1
4,3.7,5.2,2,159,3,0,1,0,7,1


In [28]:
X=data

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [30]:
model.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [31]:
model.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

In [32]:
model.score(X_test,y_test)

1.0

the accuracy of the model depends upon the data as train_test_split performs random splitting of data and will change with different data.

but the feature scaling we did with the two columns will be very useful for the model.