<a href="https://colab.research.google.com/github/priyankajuttu/FUTURE_ML_02/blob/main/Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Update the file path below to the correct uploaded file location
file_path = '/content/Spotify_data.xlsx'  # Replace with your actual file name

# Load the Excel file
data = pd.read_excel(file_path)

# Display first few rows and info
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()


In [None]:
# Categorical columns whose NaN entries are replaced by the placeholder 'Unknown'
cols_with_missing = [
    'preffered_premium_plan',
    'fav_pod_genre',
    'preffered_pod_format',
    'pod_host_preference',
    'preffered_pod_duration',
]
data[cols_with_missing] = data[cols_with_missing].fillna(value='Unknown')

# Turn every object-dtype column into indicator columns; drop_first=True omits
# the first level of each column.
categorical_cols = data.select_dtypes(include=['object']).columns
data_encoded = pd.get_dummies(
    data=data,
    drop_first=True,
    columns=categorical_cols,
)

print(data_encoded.head())


In [None]:
# Label a row as churn (1) when the usage period is 'Less than 6 months'
# or the premium willingness answer is 'No'; every other row gets 0.
data['churn'] = (
    data['spotify_usage_period'].eq('Less than 6 months')
    | data['premium_sub_willingness'].eq('No')
).astype(int)

# Show how many rows fall into each class
print(data['churn'].value_counts())


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Features and target
# The label is read from `data`: `data_encoded` may have been built before the
# `churn` column was added, in which case data_encoded['churn'] raises KeyError.
y = data['churn']

# `churn` is computed directly from these two columns, so leaving them (or their
# dummy columns) in X leaks the label and lets every model score ~100%.
label_sources = ('spotify_usage_period', 'premium_sub_willingness')
leaky_cols = [
    col for col in data_encoded.columns
    if any(str(col) == src or str(col).startswith(src + '_') for src in label_sources)
]
# errors='ignore' tolerates `churn` being absent from data_encoded.
X = data_encoded.drop(columns=['churn', *leaky_cols], errors='ignore')

# Split train-test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Evaluation function
def evaluate_model(y_true, y_pred, model_name):
    print(f"Evaluation for {model_name}")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))
    print("ROC AUC:", roc_auc_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\n")

# Evaluate all models
evaluate_model(y_test, y_pred_lr, "Logistic Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")
evaluate_model(y_test, y_pred_xgb, "XGBoost")


In [None]:
# 1. Churn label: 0 only when the usage period is not 'Less than 6 months'
#    AND the premium willingness answer is not 'No'; otherwise 1.
data['churn'] = (~(
    (data['spotify_usage_period'] != 'Less than 6 months')
    & (data['premium_sub_willingness'] != 'No')
)).astype(int)

# 2. Missing-value handling is not repeated in this cell.
# ... (your missing value handling code here)

# 3. One-hot encode the object-dtype columns. `churn` is an int column, so it
#    is not expanded and is carried into data_encoded unchanged.
data_encoded = pd.get_dummies(
    data=data,
    drop_first=True,
    columns=data.select_dtypes(include=['object']).columns,
)

# 4. Separate the feature matrix from the target column
X, y = data_encoded.drop(columns=['churn']), data_encoded['churn']


In [None]:
# Step 1: (Re)compute the churn flag before anything is encoded.
# A row is churn (1) if either condition below holds.
data['churn'] = (
    data['spotify_usage_period'].isin(['Less than 6 months'])
    | data['premium_sub_willingness'].isin(['No'])
).astype(int)

# Step 2: Replace NaN with 'Unknown' in the categorical columns listed here
cols_with_missing = [
    'preffered_premium_plan',
    'fav_pod_genre',
    'preffered_pod_format',
    'pod_host_preference',
    'preffered_pod_duration',
]
data[cols_with_missing] = data[cols_with_missing].fillna(value='Unknown')

# Step 3: One-hot encode the object-dtype columns; the int `churn` column is
# carried into data_encoded as-is.
data_encoded = pd.get_dummies(
    data=data,
    drop_first=True,
    columns=data.select_dtypes(include=['object']).columns,
)

# Step 4: Feature matrix = everything except the label; target = the label
y = data_encoded['churn']
X = data_encoded.drop(columns=['churn'])


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Features and target
y = data_encoded['churn']

# `churn` is computed directly from these two columns, so leaving them (or their
# dummy columns) in X leaks the label and lets every model score ~100%.
label_sources = ('spotify_usage_period', 'premium_sub_willingness')
leaky_cols = [
    col for col in data_encoded.columns
    if any(str(col) == src or str(col).startswith(src + '_') for src in label_sources)
]
X = data_encoded.drop(columns=['churn', *leaky_cols])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Evaluation function
def evaluate_model(y_true, y_pred, model_name):
    print(f"Evaluation for {model_name}")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))
    print("F1 Score:", f1_score(y_true, y_pred))
    print("ROC AUC:", roc_auc_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\n")

# Evaluate all models
evaluate_model(y_test, y_pred_lr, "Logistic Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")
evaluate_model(y_test, y_pred_xgb, "XGBoost")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Feature importances from the fitted XGBoost model
importances = xgb.feature_importances_
indices = np.argsort(importances)[::-1]  # feature positions, most important first
feature_names = X.columns

# Never request more bars than there are features: with a fixed 10, plt.bar
# raises a shape-mismatch error on a feature matrix narrower than 10 columns.
top_n = min(10, len(indices))
top_indices = indices[:top_n]

# Plot the top important features
plt.figure(figsize=(10, 6))
plt.title(f"Top {top_n} Feature Importances (XGBoost)")
plt.bar(range(top_n), importances[top_indices], align="center")
plt.xticks(range(top_n), [feature_names[i] for i in top_indices], rotation=45, ha="right")
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.tight_layout()
plt.show()


In [None]:
# Export feature importances
import pandas as pd
import numpy as np

# Pair each feature name with its XGBoost importance, rank from highest to
# lowest, and save the table without the index column.
fi_df = pd.DataFrame({'feature': X.columns, 'importance': xgb.feature_importances_})
fi_df = fi_df.sort_values(by='importance', ascending=False)
fi_df.to_csv('feature_importance.csv', index=False)

# Export model metrics
# The values are computed from the fitted models' test-set predictions rather
# than typed in by hand, so the CSV always reflects the current run.
def _metric_column(y_true, y_pred, y_score):
    """Return [accuracy, precision, recall, F1, ROC AUC], matching the 'Metric' row order."""
    return [
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        # ROC AUC is computed from class-1 probabilities, not hard labels.
        roc_auc_score(y_true, y_score),
    ]

metrics = {
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC'],
    'Logistic_Regression': _metric_column(y_test, y_pred_lr, lr.predict_proba(X_test)[:, 1]),
    'Random_Forest': _metric_column(y_test, y_pred_rf, rf.predict_proba(X_test)[:, 1]),
    'XGBoost': _metric_column(y_test, y_pred_xgb, xgb.predict_proba(X_test)[:, 1]),
}
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv('model_metrics.csv', index=False)

# Export confusion matrix
# labels=[0, 1] pins the matrix to 2x2 even if one class is missing from both
# y_test and the predictions; otherwise the 2x2 row/column names below would
# not fit the returned matrix.
cm = confusion_matrix(y_test, y_pred_xgb, labels=[0, 1])
cm_df = pd.DataFrame(cm, columns=['Predicted_No','Predicted_Yes'], index=['Actual_No','Actual_Yes'])
cm_df.to_csv('confusion_matrix.csv')

# Export per-sample results for the test set: the true label, XGBoost's
# predicted label, and the second predict_proba column as the churn probability.
results_df = pd.DataFrame(
    dict(
        Actual=y_test,
        Predicted=y_pred_xgb,
        Churn_Probability=xgb.predict_proba(X_test)[:, 1],
    )
)
results_df.to_csv('churn_predictions.csv', index=False)


In [None]:
# Write the working dataframe (including the derived churn column) to CSV,
# leaving out the index column.
data.to_csv(path_or_buf='cleaned_data.csv', index=False)
