In [4]:
# Import libraries and methods/functions
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

# Start your code here!
# Load data
file = pd.read_csv('telecom_demographics.csv')
file2 = pd.read_csv('telecom_usage.csv')

# Join data
churn_df = file.merge(file2, on='customer_id')

# Identify churn rate
churn_rate = churn_df['churn'].value_counts() / len(churn_df)
print(churn_rate)

# Identify categorical variables
print(churn_df.info())#in simple words we can say it is going to check whether the data is in numeric format or in any other format
#For this we use hotline encoding,label encoding etc
# One Hot Encoding for categorical variables
churn_df = pd.get_dummies(churn_df, columns=['telecom_partner', 'gender', 'state', 'city', 'registration_event'])
# Feature Scaling
scaler = StandardScaler()
# drop customer id as it is not a feature
features = churn_df.drop(['customer_id', 'churn'], axis=1)
features_scaled = scaler.fit_transform(features)

# Target variable
target = churn_df['churn']
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

# Instantiate the Logistic Regression
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)

# Logistic Regression predictions
logreg_pred = logreg.predict(X_test)

# Logistic Regression evaluation
print("Logical Regression Reports:")
print("Logistic Regression Confusion Matrix:")
print(confusion_matrix(y_test, logreg_pred))
print("Logistic Regression Classification Report:")
print(classification_report(y_test, logreg_pred))

# Instantiate the Random Forest model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Random Forest predictions
rf_pred = rf.predict(X_test)

# Random Forest evaluation
print("Random Forest Report:")
print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, rf_pred))
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_pred))

# Determine which model has higher accuracy
logreg_accuracy = logreg.score(X_test, y_test)
rf_accuracy = rf.score(X_test, y_test)

higher_accuracy = "Logistic Regression" if logreg_accuracy > rf_accuracy else "Random Forest"
print(f'Model with Higher Accuracy: {higher_accuracy}')

Random Forest Confusion Matrix:
[[1026    1]
 [ 273    0]]
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      1027
           1       0.00      0.00      0.00       273

    accuracy                           0.79      1300
   macro avg       0.39      0.50      0.44      1300
weighted avg       0.62      0.79      0.70      1300

Model with Higher Accuracy: Random Forest
