## Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.regressionplots import influence_plot
import statsmodels.formula.api as smf
# Plot styling for the whole notebook: seaborn dark-grid theme, "rainbow" palette.
sns.set_theme(palette='rainbow', style='darkgrid')

import warnings

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

In [None]:
# Read the training CSV (path is relative to the working directory) and
# preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Titanic_train.csv')
df.head(n=5)

## EDA

In [None]:
# Dimensions of the training frame as a (rows, columns) tuple.
df.shape

In [None]:
# Column names, non-null counts and dtypes.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the frame.
df.describe()

## Visualization

In [None]:
df_new=['PassengerId','Survived','Pclass','Age','SibSp','Parch','Fare']

In [None]:
for col in df_new:
    plt.figure(figsize=(6,4))
    sns.histplot(df[col],kde=True)

In [None]:
# Box plots of the training frame on a single 8x6 figure.
plt.figure(figsize=(8,6))
# Pass the frame by keyword: the first positional parameter of sns.boxplot has
# not been `data` in every seaborn release, so `data=df` is the portable form.
sns.boxplot(data=df)
plt.title("Titanic")

In [None]:
sns.pairplot(df)

In [None]:
plt.figure(figsize=(8,4))
sns.heatmap(df.corr(numeric_only=True),annot=True)

## Data Preprocessing

In [None]:
# Preview the first rows again before cleaning starts.
df.head()

In [None]:
# (rows, columns) of the frame before any column is dropped.
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Missing values per column expressed as a percentage of the row count.
df.isnull().sum()/df.shape[0]*100

In [None]:
# Remove the Cabin column from the training frame.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell safe
# to re-run: a second execution is a no-op rather than a KeyError for the
# already-removed column.
df = df.drop(columns=['Cabin'], errors='ignore')

In [None]:
# Fill missing ages with the median of the ages present in the training frame.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(value=age_median)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
encoder=LabelEncoder()
# Label-encode only the non-numeric columns. Encoding every column would replace
# numeric values (e.g. Age, Fare) with their position among the sorted unique
# values and renumber PassengerId from 0, which would then be reported as the
# passenger id in the prediction table built from xtest['PassengerId'].
# The encoder is refit for each column by fit_transform, so reusing one
# instance is fine. Re-running the cell is a no-op once no such column remains.
for columns in df.select_dtypes(exclude='number').columns:
    df[columns]=encoder.fit_transform(df[columns])

In [None]:
df.head()

In [None]:
df = pd.get_dummies(df,columns=['Sex','Embarked'],dtype=int)
df

## Cleaning the Test Dataset

In [None]:
# Read the test CSV (path is relative to the working directory) and display it.
data = pd.read_csv(filepath_or_buffer='Titanic_test.csv')
data

In [None]:
# (rows, columns) of the test frame.
data.shape

In [None]:
# Number of missing values in each column of the test frame.
data.isnull().sum()

In [None]:
# Missing values per column expressed as a percentage of the row count.
data.isnull().sum()/data.shape[0]*100

In [None]:

# Fill missing ages in the test frame with that frame's own median age, then
# display the frame.
test_age_median = data['Age'].median()
data['Age'] = data['Age'].fillna(value=test_age_median)
data

In [None]:

# Remove the Cabin column from the test frame.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell safe
# to re-run: a second execution is a no-op rather than a KeyError for the
# already-removed column.
data = data.drop(columns=['Cabin'], errors='ignore')
data.head()

In [None]:
# A fresh encoder is created here, so the integer codes are derived from the
# test frame's own values.
encoder=LabelEncoder()
# Label-encode only the non-numeric columns. Encoding every column would replace
# numeric values (e.g. Age, Fare, PassengerId) with their position among the
# sorted unique values instead of keeping the original numbers.
# Re-running the cell is a no-op once no non-numeric column remains.
for columns in data.select_dtypes(exclude='number').columns:
    data[columns]=encoder.fit_transform(data[columns])

In [None]:
# Expand Sex and Embarked in the test frame into integer (0/1) indicator
# columns and preview the first five rows.
test_dummy_columns = ['Sex', 'Embarked']
data = pd.get_dummies(data=data, columns=test_dummy_columns, dtype=int)
data.head(n=5)

## Data Preparation

In [None]:
# Target: the Survived column. Features: every other column of the frame.
y = df['Survived']
x = df.drop(columns='Survived')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# NOTE(review): repeats the train/test split from the earlier cell with the
# same arguments (test_size=0.2, random_state=1) and rebinds the same four
# names; with unchanged x and y this is redundant.
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=1)

## Model Building

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier with default settings, fitted on the
# training split.
logomodel = LogisticRegression()
logomodel.fit(X=xtrain, y=ytrain)

In [None]:
# Score on the training split (for scikit-learn classifiers `score` is the
# mean accuracy).
logomodel.score(xtrain,ytrain)

In [None]:
# Score on the held-out split, for comparison with the training figure.
logomodel.score(xtest,ytest)

## Making Prediction

In [None]:
predictions = logomodel.predict(xtest)

In [None]:
final_prediction = pd.DataFrame({'PassengerId':xtest['PassengerId'],'Survived':predictions})

In [None]:
final_prediction

## Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report


In [None]:
y_predict_train=logomodel.predict(xtrain)

In [None]:
accuracy_score=accuracy_score(ytrain,y_predict_train)

In [None]:
y_predict_test=logomodel.predict(xtest)

In [None]:
print(accuracy_score)

In [None]:
confusion_matrix(ytrain,y_predict_train)

In [None]:
sns.heatmap(confusion_matrix(ytrain,y_predict_train), annot=True,fmt='g')

In [None]:
print(classification_report(ytrain,y_predict_train))

In [None]:
ytest

In [None]:
y_predict_test=logomodel.predict(xtest)


In [None]:
confusion_matrix(ytest,y_predict_test)

In [None]:
sns.heatmap(confusion_matrix(ytest,y_predict_test), annot=True,fmt='g')

In [None]:
print(accuracy_score)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
logomodel = LogisticRegression()
logomodel.fit(xtrain, ytrain)

In [None]:
y_pred = logomodel.predict(xtest)

In [None]:
precision = precision_score(ytest, y_pred)
recall = recall_score(ytest, y_pred)
f1 = f1_score(ytest, y_pred)

In [None]:
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score, auc
import matplotlib.pyplot as plt

# Scores for the ROC curve: column 1 of predict_proba, i.e. the probability of
# the class listed second in logomodel.classes_.
y_prob = logomodel.predict_proba(xtest)[:, 1]
fpr, tpr, thresholds = roc_curve(ytest, y_prob)
# Area under the curve, computed two ways: from the (fpr, tpr) points and
# directly from the labels and scores.
roc_auc = auc(fpr, tpr)
roc_auc_score_value = roc_auc_score(ytest, y_prob)

plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
# Diagonal reference line (TPR equal to FPR).
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Logistic Regression')
plt.legend(loc='lower right')
plt.grid(True)
plt.tight_layout()
plt.show()
print("AUC Score:", roc_auc_score_value)


# Second rendering of the same curve.
# The label must format the numeric `roc_auc`: `auc` is the imported function,
# and '%0.2f' % auc raises TypeError.
plt.plot(fpr, tpr, color='red', label='logit model ( area  = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
plt.ylabel('True Positive Rate')
# Without a legend the label set above is never shown.
plt.legend(loc='lower right')
plt.show()

In [None]:
# First row of the fitted coefficient matrix and first entry of the intercept
# vector. (`coafficiants` keeps its existing spelling because it is a
# notebook-level name.)
coafficiants, intercept = logomodel.coef_[0], logomodel.intercept_[0]

## Interpretation

In [None]:
# Table with one row per feature: its fitted coefficient and the corresponding
# odds ratio exp(coefficient), rounded to two decimals.
feature_weights = logomodel.coef_[0]
odds_ratios = [round(np.exp(weight), 2) for weight in feature_weights]
coefficients = pd.DataFrame(
    data={
        'Feature': x.columns,
        'Coefficient': feature_weights,
        'Odds Ratio': odds_ratios,
    }
)
print(coefficients)

## Model Deployment

In [None]:
#! pip install streamlit

In [1]:
import os
import pickle
import warnings

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings("ignore")
print("Working directory:", os.getcwd())

# NOTE(review): the model pickled below is fitted on synthetic data from
# make_classification (100 samples, 7 features), not on the Titanic features
# used for `logomodel` earlier in the notebook. This cell also rebinds `y`.
synthetic_spec = dict(
    n_samples=100,
    n_features=7,
    n_informative=2,
    n_redundant=0,
    random_state=42,
)
X, y = make_classification(**synthetic_spec)

model = LogisticRegression()
model.fit(X, y)

# Serialize the fitted model into the working directory.
with open('logomodel.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

print(" Model saved as 'logomodel.pkl'")

Working directory: C:\Users\pawar.SARPANCH\Assignment Data Science
 Model saved as 'logomodel.pkl'


In [4]:
import pickle

import numpy as np
import streamlit as st

# Restore the pickled model. pickle.load executes whatever the file encodes, so
# it must only be pointed at a file this project produced itself.
with open('logomodel.pkl', mode='rb') as model_file:
    model = pickle.load(model_file)

# NOTE(review): the title and input labels describe employee performance,
# whereas the rest of the notebook works on Titanic data; confirm which is meant.
st.title("Employee Performance Prediction")
st.markdown("Provide the employee details below to predict their performance outcome using Logistic Regression.")

# Seven input widgets, created in the order the model receives the values.
age = st.number_input(label="Employee Age (Years)", min_value=18, max_value=65)
monthly_income = st.number_input(label="Monthly Income (in ₹)", min_value=0)
experience_years = st.number_input(label="Total Years of Experience", min_value=0.0)
projects_handled = st.number_input(label="Number of Completed Projects", min_value=0)
training_score = st.slider("Training Assessment Score (%)", min_value=0, max_value=100)
certifications = st.number_input(label="Number of Professional Certifications", min_value=0)
working_hours_per_week = st.number_input(label="Average Weekly Working Hours", min_value=0, max_value=100)

# On click: build a single-row feature array and show the predicted label.
if st.button("Predict"):
    feature_row = [
        age,
        monthly_income,
        experience_years,
        projects_handled,
        training_score,
        certifications,
        working_hours_per_week,
    ]
    input_data = np.array([feature_row])
    prediction = model.predict(input_data)

    st.subheader(" Prediction Result")
    st.success(f"Predicted Performance Outcome: {prediction[0]}")




In [5]:
%%writefile model.py
# Minimal Streamlit script: shows a title and one line of text.
import streamlit as st
st.title("Test Streamlit App")
st.write("This is a simple app to verify Streamlit setup.")

Overwriting model.py


In [None]:
# Launch Streamlit on model.py, the file written by the %%writefile cell above.
!streamlit run model.py

## Interview Questions

1. What is the difference between precision and recall?

Precision and Recall are two important metrics used to evaluate the performance of classification models:

Precision answers the question: "Of all the positive predictions made by the model, how many are actually correct?" For example, if a model predicts 10 emails as spam but only 6 are truly spam, the precision is 60%. It focuses on avoiding false alarms (false positives). This is useful when making a wrong positive prediction is costly, like flagging important emails as spam.

Recall answers the question: "Of all the actual positive cases, how many did the model correctly identify?" For example, if there are 10 spam emails and the model only identifies 7 of them as spam, the recall is 70%. It focuses on not missing true cases, i.e. on keeping false negatives low. This is critical in situations like detecting diseases, where missing a case can have serious consequences.


2. What is cross-validation, and why is it important in binary classification?

Cross-validation is a technique used to evaluate the performance of a model by splitting the data into multiple subsets and training/testing the model on different combinations of these subsets.

Detects Overfitting: Because every fold is scored on data the model was not trained on, cross-validation reveals whether the model has memorized the training data instead of learning general patterns.

Estimates Model Performance: It helps assess how the model will perform on unseen data by evaluating it on multiple splits of the dataset.

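Handles Imbalanced Data: In binary classification, where one class may be underrepresented, stratified cross-validation keeps the class ratio roughly the same in every fold, so performance on the minority class is measured in each split rather than depending on one lucky or unlucky split.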

Gives a More Reliable Estimate: Averaging the scores over several splits gives a more robust estimate of model performance, because it reduces the variability that occurs when the model is trained and tested on a single split of the data.