In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [9]:
telecom_demographics = pd.read_csv('/content/telecom_demographics.csv')
telecom_usage = pd.read_csv('/content/telecom_usage.csv')

# Display the first few rows and info of the DataFrames
print("Telecom Demographics Data:")
print(telecom_demographics.head())
print(telecom_demographics.info())

print("\nTelecom Usage Data:")
print(telecom_usage.head())
print(telecom_usage.info())

Telecom Demographics Data:
   customer_id telecom_partner gender  age             state       city  \
0        15169          Airtel      F   26  Himachal Pradesh      Delhi   
1       149207          Airtel      F   74       Uttarakhand  Hyderabad   
2       148119          Airtel      F   54         Jharkhand    Chennai   
3       187288    Reliance Jio      M   29             Bihar  Hyderabad   
4        14016        Vodafone      M   45          Nagaland  Bangalore   

   pincode registration_event  num_dependents  estimated_salary  
0   667173         2020-03-16               4             85979  
1   313997         2022-01-16               0             69445  
2   549925         2022-01-11               2             75949  
3   230636         2022-07-26               3             34272  
4   188036         2020-03-11               4             34157  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 10 columns):
 #   Column        

In [10]:
churn_df = pd.merge(telecom_demographics, telecom_usage, on='customer_id')

# Display the first few rows of the merged DataFrame
print(churn_df.head())

   customer_id telecom_partner gender  age             state       city  \
0        15169          Airtel      F   26  Himachal Pradesh      Delhi   
1       149207          Airtel      F   74       Uttarakhand  Hyderabad   
2       148119          Airtel      F   54         Jharkhand    Chennai   
3       187288    Reliance Jio      M   29             Bihar  Hyderabad   
4        14016        Vodafone      M   45          Nagaland  Bangalore   

   pincode registration_event  num_dependents  estimated_salary  calls_made  \
0   667173         2020-03-16               4             85979          75   
1   313997         2022-01-16               0             69445          35   
2   549925         2022-01-11               2             75949          70   
3   230636         2022-07-26               3             34272          95   
4   188036         2020-03-11               4             34157          66   

   sms_sent  data_used  churn  
0        21       4532      1  
1        3

In [11]:
churn_rate = churn_df['churn'].mean()
print(f'Churn Rate: {churn_rate:.2f}')

Churn Rate: 0.20


In [12]:
from sklearn.preprocessing import OneHotEncoder

# Identify categorical features
categorical_features = ['telecom_partner', 'gender', 'state', 'city']

# Convert 'registration_event' to datetime and then to numerical value (e.g., number of days since the earliest date)
churn_df['registration_event'] = pd.to_datetime(churn_df['registration_event'])
churn_df['registration_event'] = (churn_df['registration_event'] - churn_df['registration_event'].min()).dt.days

# One-hot encode the categorical features
one_hot_encoder = OneHotEncoder(sparse=False, drop='first')
categorical_encoded = one_hot_encoder.fit_transform(churn_df[categorical_features])

# Convert the encoded categorical features into a DataFrame
categorical_encoded_df = pd.DataFrame(categorical_encoded, columns=one_hot_encoder.get_feature_names_out(categorical_features))

# Drop the original categorical columns and concatenate the encoded columns
churn_df = churn_df.drop(columns=categorical_features)
churn_df = pd.concat([churn_df, categorical_encoded_df], axis=1)



In [13]:
features_to_scale = ['age', 'num_dependents', 'estimated_salary', 'calls_made', 'sms_sent', 'data_used']

# Initialize the StandardScaler
scaler = StandardScaler()

# Scale the features
scaled_features = scaler.fit_transform(churn_df[features_to_scale])

# Convert the scaled features into a DataFrame
scaled_features_df = pd.DataFrame(scaled_features, columns=features_to_scale)

# Drop the original columns and concatenate the scaled columns
churn_df = churn_df.drop(columns=features_to_scale)
churn_df = pd.concat([churn_df, scaled_features_df], axis=1)

In [14]:
target = 'churn'

# Define the feature set
features = churn_df.drop(columns=[target])

# Define the target variable
target_variable = churn_df[target]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(features, target_variable, test_size=0.2, random_state=42)

In [16]:
from sklearn.linear_model import LogisticRegression

# Initialize the models
logreg_model = LogisticRegression(random_state=42)
rf_model = RandomForestClassifier(random_state=42)

# Train the models
logreg_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

# Predict on the test data
logreg_pred = logreg_model.predict(X_test)
rf_pred = rf_model.predict(X_test)

In [17]:
from sklearn.metrics import accuracy_score

# Calculate accuracy scores
logreg_accuracy = accuracy_score(y_test, logreg_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)

# Determine which model has higher accuracy
higher_accuracy = "LogisticRegression" if logreg_accuracy > rf_accuracy else "RandomForest"

# Print the accuracy scores and the model with higher accuracy
print(f'Logistic Regression Accuracy: {logreg_accuracy:.2f}')
print(f'Random Forest Accuracy: {rf_accuracy:.2f}')
print(f'Higher Accuracy Model: {higher_accuracy}')

# Print classification reports
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, logreg_pred))

print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_pred))

# Print confusion matrices
print("\nLogistic Regression Confusion Matrix:")
print(confusion_matrix(y_test, logreg_pred))

print("\nRandom Forest Confusion Matrix:")
print(confusion_matrix(y_test, rf_pred))

Logistic Regression Accuracy: 0.79
Random Forest Accuracy: 0.79
Higher Accuracy Model: RandomForest

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      1027
           1       0.00      0.00      0.00       273

    accuracy                           0.79      1300
   macro avg       0.40      0.50      0.44      1300
weighted avg       0.62      0.79      0.70      1300


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      1027
           1       0.00      0.00      0.00       273

    accuracy                           0.79      1300
   macro avg       0.40      0.50      0.44      1300
weighted avg       0.62      0.79      0.70      1300


Logistic Regression Confusion Matrix:
[[1027    0]
 [ 273    0]]

Random Forest Confusion Matrix:
[[1027    0]
 [ 273    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
