<a href="https://colab.research.google.com/github/prathyushalahari/Machine-Learning-Model-Implementations/blob/main/Linear_and_Logistic_Regression_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pandas numpy scikit-learn matplotlib seaborn




In [12]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset
# `url` records where the dataset comes from; it is not used by the code below,
# which reads a local copy of the CSV from the working directory.
url = "https://www.kaggle.com/datasets/pavansubhasht/ibm-hr-analytics-attrition-dataset/download"
data = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')

# Explore the dataset
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
data.info()
print(data.describe())


   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  RelationshipSatisfaction StandardHours  StockOptionLevel  \
0  ...

In [13]:
# Count the null entries in each column and show the per-column totals
missing_per_column = data.isnull().sum()
print(missing_per_column)


Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [15]:
# Mean imputation, restricted to the numeric columns:
# 1) pick out the numeric column labels,
# 2) compute each of those columns' mean,
# 3) replace NaNs in those columns with the matching mean.
numeric_cols = data.select_dtypes(include=np.number).columns
numeric_means = data[numeric_cols].mean()
data[numeric_cols] = data[numeric_cols].fillna(numeric_means)

In [22]:
# Print `data` for a visual check; pandas abbreviates large frames in this
# plain-text output (elided rows/columns are shown as "...").
print(data)

      Age Attrition     BusinessTravel  DailyRate              Department  \
0      41       Yes      Travel_Rarely       1102                   Sales   
1      49        No  Travel_Frequently        279  Research & Development   
2      37       Yes      Travel_Rarely       1373  Research & Development   
3      33        No  Travel_Frequently       1392  Research & Development   
4      27        No      Travel_Rarely        591  Research & Development   
...   ...       ...                ...        ...                     ...   
1465   36        No  Travel_Frequently        884  Research & Development   
1466   39        No      Travel_Rarely        613  Research & Development   
1467   27        No      Travel_Rarely        155  Research & Development   
1468   49        No  Travel_Frequently       1023                   Sales   
1469   34        No      Travel_Rarely        628  Research & Development   

      DistanceFromHome  Education EducationField  EmployeeCount  \
0       

In [23]:
# Re-count the null entries per column to confirm none remain
print(data.isna().sum())

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [25]:
from sklearn.preprocessing import LabelEncoder

# Turn the categorical columns into numeric ones.

# Step 1: the 'Attrition' labels are mapped to integer codes with a LabelEncoder.
label_enc = LabelEncoder()
attrition_codes = label_enc.fit_transform(data['Attrition'])
data['Attrition'] = attrition_codes

# Step 2: the remaining categorical features (JobRole, Department, etc.) are
# one-hot encoded; drop_first=True omits the first level of each feature.
data = pd.get_dummies(data, drop_first=True)

print(data)


      Age  Attrition  DailyRate  DistanceFromHome  Education  EmployeeCount  \
0      41          1       1102                 1          2              1   
1      49          0        279                 8          1              1   
2      37          1       1373                 2          2              1   
3      33          0       1392                 3          4              1   
4      27          0        591                 2          1              1   
...   ...        ...        ...               ...        ...            ...   
1465   36          0        884                23          2              1   
1466   39          0        613                 6          1              1   
1467   27          0        155                 4          3              1   
1468   49          0       1023                 2          3              1   
1469   34          0        628                 8          3              1   

      EmployeeNumber  EnvironmentSatisfaction  Hour

In [28]:
from sklearn.model_selection import train_test_split

# Linear Regression: the target is 'MonthlyIncome', every other column is a feature
y_linear = data['MonthlyIncome']
X_linear = data.drop(columns='MonthlyIncome')

# Logistic Regression: the target is 'Attrition', every other column is a feature
y_logistic = data['Attrition']
X_logistic = data.drop(columns='Attrition')

# Hold out 20% of the rows as a test set; both splits use the same fixed seed
split_options = {'test_size': 0.2, 'random_state': 42}
X_train_linear, X_test_linear, y_train_linear, y_test_linear = train_test_split(
    X_linear, y_linear, **split_options
)
X_train_logistic, X_test_logistic, y_train_logistic, y_test_logistic = train_test_split(
    X_logistic, y_logistic, **split_options
)

print(data.head())

   Age  Attrition  DailyRate  DistanceFromHome  Education  EmployeeCount  \
0   41          1       1102                 1          2              1   
1   49          0        279                 8          1              1   
2   37          1       1373                 2          2              1   
3   33          0       1392                 3          4              1   
4   27          0        591                 2          1              1   

   EmployeeNumber  EnvironmentSatisfaction  HourlyRate  JobInvolvement  ...  \
0               1                        2          94               3  ...   
1               2                        3          61               2  ...   
2               4                        4          92               2  ...   
3               5                        4          56               3  ...   
4               7                        1          40               3  ...   

   JobRole_Laboratory Technician  JobRole_Manager  \
0              

In [42]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score,adjusted_rand_score

# Create the Linear Regression model and fit it on the training split
# (fit() returns the estimator itself, so the two steps are chained)
linear_model = LinearRegression().fit(X_train_linear, y_train_linear)

# Predict the target for the held-out test split
y_pred_linear = linear_model.predict(X_test_linear)

# Error metrics and goodness of fit, all measured on the test split
mse_linear = mean_squared_error(y_test_linear, y_pred_linear)
mean_absolute_error_linear = mean_absolute_error(y_test_linear, y_pred_linear)
r2_score_linear = r2_score(y_test_linear, y_pred_linear)
r2_percentage_linear = r2_score_linear * 100

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
# where n is the number of test rows and p the number of feature columns
n_test_rows = len(y_test_linear)
n_features = X_test_linear.shape[1]
r2_score_adjusted_linear = 1 - (1 - r2_score_linear) * (n_test_rows - 1) / (n_test_rows - n_features - 1)

# Report every metric, one per line
print(
    f"Linear Regression MSE: {mse_linear}",
    f"Linear Regression MAE: {mean_absolute_error_linear}",
    f"Linear Regression R2 Score: {r2_score_linear}",
    f"R-Squared Percentage : {r2_percentage_linear:.2f}%",
    f"Linear Regression Adjusted R2 Score: {r2_score_adjusted_linear}",
    sep="\n",
)

Linear Regression MSE: 1358868.1737296914
Linear Regression MAE: 891.7550014906446
Linear Regression R2 Score: 0.9378246275615041
R-Squared Percentage : 93.78%
Linear Regression Adjusted R2 Score: 0.9259455929899215
