In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix


In [84]:
# Absolute location of the survey CSV (presumably a Colab upload — adjust when running elsewhere)
file_path = '/content/SILKYSKY_DATA_CW2 (S).csv'
# Decode the file as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8
data = pd.read_csv(filepath_or_buffer=file_path, encoding='ISO-8859-1')

In [None]:
# Preview the first rows of the freshly loaded frame
data.head()

In [86]:


# Convert 'Satisfied' column to numerical values (Y -> 1, N -> 0).
# The mapping is guarded so that re-running this cell is a no-op: applying
# .map() to an already-encoded column would turn every 1/0 into NaN, and the
# later dropna() would then discard every row.
satisfied_codes = {'Y': 1, 'N': 0}
if data['Satisfied'].isin(list(satisfied_codes)).any():
  data['Satisfied'] = data['Satisfied'].map(satisfied_codes)


In [None]:
# Preview the first rows again, now that 'Satisfied' has been mapped to 1/0
data.head()

In [None]:
# Inspect the last rows of the frame
data.tail()

In [None]:
# Summary statistics for the frame
data.describe()


In [None]:
# Count missing values per column
data.isnull().sum()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns # Added import for seaborn

# Drop rows with missing values (if any).
# Reassign instead of mutating in place so the step reads as an explicit
# transformation of `data`.
data = data.dropna()

# Explore data distribution for numerical features
numerical_features = data.select_dtypes(include=np.number).columns
for feature in numerical_features:
  plt.figure(figsize=(8, 6))
  sns.histplot(data[feature], kde=True)
  plt.title(f'Distribution of {feature}')
  plt.show()

# Explore data distribution for categorical features
categorical_features = data.select_dtypes(include='object').columns
for feature in categorical_features:
  plt.figure(figsize=(8, 6))
  sns.countplot(x=feature, data=data)
  plt.title(f'Distribution of {feature}')
  plt.xticks(rotation=45)
  plt.show()

# Analyze correlation between features (numeric columns only)
correlation_matrix = data.corr(numeric_only=True)
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Explore relationship between features and target variable.
# 'Satisfied' is numeric once encoded, so it appears in numerical_features;
# leave it out here to avoid a meaningless box plot of the target against
# itself. errors='ignore' keeps this working if the target is not numeric.
predictor_numerical_features = numerical_features.drop('Satisfied', errors='ignore')
for feature in predictor_numerical_features:
  plt.figure(figsize=(8, 6))
  sns.boxplot(x='Satisfied', y=feature, data=data)
  plt.title(f'Relationship between {feature} and Satisfaction')
  plt.show()

for feature in categorical_features:
  plt.figure(figsize=(8, 6))
  sns.countplot(x=feature, hue='Satisfied', data=data)
  plt.title(f'Relationship between {feature} and Satisfaction')
  plt.xticks(rotation=45)
  plt.show()

In [93]:
# One-hot encode the remaining categorical columns; drop_first=True omits the
# first level of each encoded column.
# NOTE(review): any high-cardinality text column (IDs, free text) would also be
# expanded here — confirm the column set before modelling.
data = pd.get_dummies(data=data, drop_first=True)


In [96]:
# Separate the target ('Satisfied') from the predictor columns
y = data['Satisfied']
X = data.drop(columns=['Satisfied'])




In [98]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified — consider stratify=y if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=42,
)

In [None]:

import matplotlib.pyplot as plt
# Install xgboost if not already installed
!pip install xgboost

from xgboost import XGBClassifier

# Initialize the XGBoost classifier
xgb_model = XGBClassifier(random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred)
print(f'ROC AUC: {roc_auc}')

# Plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


In [None]:


import matplotlib.pyplot as plt
# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Hard class predictions, used by the threshold-dependent metrics below
y_pred_rf = rf_model.predict(X_test)
# Predicted probability of the positive class (column 1), used for ROC analysis
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# Evaluate the model
print(classification_report(y_test, y_pred_rf))

# Calculate ROC AUC score.
# ROC AUC is a ranking metric and must be given continuous scores; feeding it
# hard 0/1 labels evaluates a single threshold and misreports the AUC.
roc_auc_rf = roc_auc_score(y_test, y_proba_rf)
print(f'ROC AUC (Random Forest): {roc_auc_rf}')

# Plot ROC curve (from probabilities, so the whole threshold range is traced)
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_proba_rf)
plt.plot(fpr_rf, tpr_rf)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (Random Forest)')
plt.show()

# Generate confusion matrix (rows: actual class, columns: predicted class)
cm_rf = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (Random Forest)')
plt.show()


In [None]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Requires y_test and the XGBoost hard predictions (y_pred) from the cells above.

# Score the predictions with each threshold-dependent metric in turn
accuracy, precision, recall, f1 = (
  scorer(y_test, y_pred)
  for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the scores, one per line
print("XGBoost Classifier Evaluation Metrics:")
for caption, score in (
  ("Accuracy:", accuracy),
  ("Precision:", precision),
  ("Recall:", recall),
  ("F1-Score:", f1),
):
  print(caption, score)


In [None]:

from sklearn.metrics import roc_auc_score

# Requires the fitted xgb_model plus X_test / y_test from the cells above.

# Calculate ROC AUC score from positive-class probabilities (column 1).
# Passing hard 0/1 predictions would evaluate a single threshold only and
# misreport the model's ranking quality.
roc_auc = roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])

# Print the ROC AUC score
print("ROC AUC Score for XGBoost Classifier:", roc_auc)


In [None]:
# Per-class precision / recall / F1 for the Random Forest hard predictions
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest Classification Report:")
print(rf_report)


In [None]:


# Calculate ROC AUC score for Random Forest from positive-class probabilities
# (column 1). Passing hard 0/1 predictions would evaluate a single threshold
# only and misreport the model's ranking quality.
roc_auc_rf = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])

# Print the ROC AUC score for Random Forest
print("ROC AUC Score for Random Forest Classifier:", roc_auc_rf)


In [None]:

import matplotlib.pyplot as plt
# Confusion matrices for both models (rows: actual class, columns: predicted class)
cm_xgb = confusion_matrix(y_test, y_pred)
cm_rf = confusion_matrix(y_test, y_pred_rf)

# Draw one annotated heatmap per model, each in its own figure
for matrix, palette, heading in (
  (cm_xgb, 'Blues', 'Confusion Matrix (XGBoost)'),
  (cm_rf, 'Greens', 'Confusion Matrix (Random Forest)'),
):
  plt.figure(figsize=(8, 6))
  sns.heatmap(matrix, annot=True, fmt='d', cmap=palette)
  plt.xlabel('Predicted')
  plt.ylabel('Actual')
  plt.title(heading)
  plt.show()


In [None]:

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# ROC curve for XGBoost, built from positive-class probabilities
xgb_scores = xgb_model.predict_proba(X_test)[:, 1]
fpr_xgb, tpr_xgb, thresholds_xgb = roc_curve(y_test, xgb_scores)
roc_auc_xgb = auc(fpr_xgb, tpr_xgb)

# ROC curve for Random Forest, built the same way
rf_scores = rf_model.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, rf_scores)
roc_auc_rf = auc(fpr_rf, tpr_rf)

# Legend entries carry each model's AUC to two decimals
xgb_label = 'XGBoost (AUC = %0.2f)' % roc_auc_xgb
rf_label = 'Random Forest (AUC = %0.2f)' % roc_auc_rf

# Overlay both curves on one figure, with the chance diagonal for reference
plt.figure(figsize=(8, 6))
plt.plot(fpr_xgb, tpr_xgb, color='darkorange', linewidth=2, label=xgb_label)
plt.plot(fpr_rf, tpr_rf, color='blue', linewidth=2, label=rf_label)
plt.plot([0, 1], [0, 1], color='navy', linewidth=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


In [None]:

import pandas as pd
import matplotlib.pyplot as plt
# Importance scores exposed by the fitted Random Forest, one per training column
importances = rf_model.feature_importances_

# Pair each column name with its score and rank from most to least important
feature_importances_df = (
  pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
  .sort_values(by='Importance', ascending=False)
)

# Show the ranked table as text
print(feature_importances_df)

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(12, 8))
sns.barplot(data=feature_importances_df, x='Importance', y='Feature')
plt.title('Random Forest Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()


In [None]:

import pandas as pd
import matplotlib.pyplot as plt
# Importance scores exposed by the fitted XGBoost model, one per training column
importances = xgb_model.feature_importances_

# Pair each column name with its score and rank from most to least important.
# Note: this reuses (overwrites) the names used for the Random Forest table.
feature_importances_df = (
  pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
  .sort_values(by='Importance', ascending=False)
)

# Show the ranked table as text
print(feature_importances_df)

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(12, 8))
sns.barplot(data=feature_importances_df, x='Importance', y='Feature')
plt.title('XGBoost Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()


In [None]:

import pandas as pd
import matplotlib.pyplot as plt
# Analyze model performance using different metrics
def analyze_model(model, model_name, X_test, y_test):
  """Print accuracy, precision, recall, F1 and ROC AUC for a fitted binary classifier.

  Args:
    model: Fitted estimator exposing ``predict``. When it also exposes
      ``predict_proba``, the positive-class probabilities are used for ROC AUC.
    model_name: Label printed in the report header.
    X_test: Feature matrix to evaluate on.
    y_test: True binary labels aligned with ``X_test``.

  Returns:
    None. The metrics are written to stdout.
  """
  y_pred = model.predict(X_test)

  # ROC AUC is a ranking metric: score it on continuous probabilities when the
  # model provides them. Hard labels collapse the curve to a single threshold,
  # so they are only a fallback for models without predict_proba.
  if hasattr(model, "predict_proba"):
    y_score = model.predict_proba(X_test)[:, 1]
  else:
    y_score = y_pred

  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  roc_auc = roc_auc_score(y_test, y_score)

  print(f"\n{model_name} Model Performance:")
  print("Accuracy:", accuracy)
  print("Precision:", precision)
  print("Recall:", recall)
  print("F1-Score:", f1)
  print("ROC AUC:", roc_auc)


# Report test-set metrics for each fitted model, XGBoost first
for fitted_model, display_name in ((xgb_model, "XGBoost"), (rf_model, "Random Forest")):
  analyze_model(fitted_model, display_name, X_test, y_test)


# Compare feature importances
def compare_feature_importance(model1, model2, model_name1, model_name2, X_train):
  """Draw a grouped bar chart comparing two models' feature importances.

  Args:
    model1, model2: Fitted estimators exposing ``feature_importances_``.
    model_name1, model_name2: Legend labels for the respective models.
    X_train: Training frame whose columns name the features.
  """
  # One long-format frame per model: feature name, importance, model label
  per_model_frames = [
    pd.DataFrame({'Feature': X_train.columns, 'Importance': fitted.feature_importances_, 'Model': label})
    for fitted, label in ((model1, model_name1), (model2, model_name2))
  ]

  # Stack the two frames so seaborn can group the bars by the 'Model' column
  combined_importances = pd.concat(per_model_frames)

  plt.figure(figsize=(12, 8))
  sns.barplot(data=combined_importances, x='Importance', y='Feature', hue='Model')
  plt.title('Comparison of Feature Importance Between Models')
  plt.xlabel('Importance')
  plt.ylabel('Feature')
  plt.show()

# Plot XGBoost (model1) against Random Forest (model2) using the training columns as feature names
compare_feature_importance(xgb_model, rf_model, "XGBoost", "Random Forest", X_train)
