In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

##### Load the dataset

In [2]:
# Read 'data.csv' (path is relative to the working directory) into `df`,
# the DataFrame every later cell in this notebook operates on.
df = pd.read_csv('data.csv')

In [3]:
# Tally the missing (NaN/None) entries in every column of the loaded frame.
null_counts = df.isna().sum()
print("Null/Empty counts for each column:", null_counts, sep="\n")

Null/Empty counts for each column:
timestamp_call_key                        0
retailer_code                             0
serial                                    0
reason                                    0
mos                                       0
resolved                                  0
no_of_accounts_with_syf_13_march       3400
account_balance_13_march             113782
delinquency_history_13_march              0
account_open_date_13_march                0
account_status_13_march                   0
card_activation_status_13_march           0
eservice_ind_13_march                     0
ebill_enrolled_status_13_march            0
auto_pay_enrolled_status_13_march         0
no_of_accounts_with_syf_18_march       3402
account_balance_18_march             101829
delinquency_history_18_march              0
account_open_date_18_march                0
account_status_18_march                   0
card_activation_status_18_march           0
eservice_ind_18_march                    

In [4]:
# Show the column labels of the loaded frame (rendered as the cell's output).
df.columns

Index(['timestamp_call_key', 'retailer_code', 'serial', 'reason', 'mos',
       'resolved', 'no_of_accounts_with_syf_13_march',
       'account_balance_13_march', 'delinquency_history_13_march',
       'account_open_date_13_march', 'account_status_13_march',
       'card_activation_status_13_march', 'eservice_ind_13_march',
       'ebill_enrolled_status_13_march', 'auto_pay_enrolled_status_13_march',
       'no_of_accounts_with_syf_18_march', 'account_balance_18_march',
       'delinquency_history_18_march', 'account_open_date_18_march',
       'account_status_18_march', 'card_activation_status_18_march',
       'eservice_ind_18_march', 'ebill_enrolled_status_18_march',
       'auto_pay_enrolled_status_18_march', 'date_of_call', 'time_of_call'],
      dtype='object')

In [5]:
# Write the first 50 rows to 'short_data.csv' as a small sample, without the index column.
df.iloc[:50].to_csv('short_data.csv', index=False)

Starting codes and their counts

In [None]:
# Take the first whitespace-separated code of every raw 'mos' sequence.
# The raw 'mos' column is used here: the derived 'cleaned_mos' column is only
# created further down in the notebook, so referencing it at this point raises
# a KeyError when the notebook is run top to bottom on a fresh kernel.
starting_codes = df['mos'].str.split().str[0]

# Frequency of each starting code, most common first
starting_codes_count = starting_codes.value_counts()

print("Starting codes and their counts:")
print(starting_codes_count)


In [None]:
# Rows whose raw 'mos' sequence starts with "mn"; keep the two codes that follow.
# Uses the raw 'mos' column, like the parallel "mm" analysis and the
# "mn mn RS" / "mn mn mn" follow-up cells; 'cleaned_mos' does not exist yet at
# this point of a top-to-bottom run.
starts_with_mn = df['mos'].str.startswith('mn')
mn_sequences = df.loc[starts_with_mn, 'mos'].str.split().str[1:3]

# Join each pair of codes into one string so identical pairs can be counted
mn_sequences_str = mn_sequences.map(' '.join)

# Occurrences of each distinct follow-up pair, most common first
mn_unique_counts = mn_sequences_str.value_counts()

print("Unique cases where 'mos' starts with 'mn' and their counts:")
print(mn_unique_counts)
print(f"Total unique cases: {mn_unique_counts.count()}")

In [None]:
# Select every row whose raw 'mos' sequence begins with "mn mn RS"
is_mn_mn_rs = df['mos'].str.startswith('mn mn RS')
mn_mn_rs_rows = df.loc[is_mn_mn_rs]

# Export the selection for inspection outside the notebook
mn_mn_rs_rows.to_csv('mn_mn_rs_rows.csv', index=False)

print(f"Saved {len(mn_mn_rs_rows)} rows to 'mn_mn_rs_rows.csv'.")

In [None]:
# Select every row whose raw 'mos' sequence begins with "mn mn mn"
is_mn_mn_mn = df['mos'].str.startswith('mn mn mn')
mn_mn_mn_rows = df.loc[is_mn_mn_mn]

# Export the selection for inspection outside the notebook
mn_mn_mn_rows.to_csv('mn_mn_mn_rows.csv', index=False)

print(f"Saved {len(mn_mn_mn_rows)} rows to 'mn_mn_mn_rows.csv'.")

In [None]:
# Rows whose raw 'mos' sequence starts with "mm"; keep the two codes that follow
starts_with_mm = df['mos'].str.startswith('mm')
mm_sequences = df.loc[starts_with_mm, 'mos'].str.split().str[1:3]

# Join each pair of codes into one string so identical pairs can be counted
mm_sequences_str = mm_sequences.map(' '.join)

# Occurrences of each distinct follow-up pair, most common first
mm_unique_counts = mm_sequences_str.value_counts()

print("Unique cases where 'mos' starts with 'mm' and their counts:")
print(mm_unique_counts)
print(f"Total unique cases: {mm_unique_counts.count()}")


In [None]:
# Select every row whose raw 'mos' sequence begins with "mm mm mm"
is_mm_mm_mm = df['mos'].str.startswith('mm mm mm')
mm_mm_mm_rows = df.loc[is_mm_mm_mm]

# Export the selection for inspection outside the notebook
mm_mm_mm_rows.to_csv('mm_mm_mm_rows.csv', index=False)

print(f"Saved {len(mm_mm_mm_rows)} rows to 'mm_mm_mm_rows.csv'.")

Count of "IA" followed by "BA"

In [None]:
# Count rows whose raw 'mos' sequence contains the substring "IA BA".
# This must look at the raw 'mos' column: clean_mos_sequence() removes every
# "BA" code when it builds 'cleaned_mos', so searching that column cannot find
# the pair (and the column does not exist yet on a top-to-bottom run).
ia_ba_count = df[df['mos'].str.contains('IA BA')].shape[0]

print(f'"IA" followed by "BA" occurs {ia_ba_count} times.')


Rows containing IA, BA, and TR in any order

In [None]:
# Count rows whose raw 'mos' sequence contains "IA", "BA" and "TR" anywhere
# (three independent substring checks, so the order does not matter).
# All three checks use the raw 'mos' column: 'cleaned_mos' has every "TR" code
# stripped by clean_mos_sequence(), so testing it for "TR" defeats the filter.
raw_mos = df['mos']
has_ia_ba_tr = raw_mos.str.contains('IA') & raw_mos.str.contains('BA') & raw_mos.str.contains('TR')
rows_with_ia_ba_tr = df[has_ia_ba_tr].shape[0]

print(f'Rows containing IA, BA, and TR in any order: {rows_with_ia_ba_tr}')


Rows ending with TR, split by "resolved" value

In [None]:
# Rows whose raw 'mos' sequence ends with "TR"; the mask is computed once and reused
ends_with_tr = df['mos'].str.endswith('TR')

# ... of those, how many have resolved == 'resolved'
rows_ending_with_tr_resolved = len(df[ends_with_tr & (df['resolved'] == 'resolved')])

# ... and how many have resolved == 'floor'
rows_ending_with_tr_floor = len(df[ends_with_tr & (df['resolved'] == 'floor')])

print(f'Rows ending with TR and resolved: {rows_ending_with_tr_resolved}')
print(f'Rows ending with TR and floor: {rows_ending_with_tr_floor}')


Count of Resolved values

In [None]:
# Frequency of each distinct value in the 'resolved' column, most common first
resolved_counts = df['resolved'].value_counts()

print("Counts of unique values in the 'resolved' column:", resolved_counts, sep="\n")

### Analyzing most impactful codes

In [38]:
def clean_mos_sequence(mos):
    """Reduce a whitespace-separated code sequence to its post-"IA" body.

    Steps, in order:
      1. Drop everything up to and including the first "IA" code (if any).
      2. Drop any further "IA" codes that immediately follow it.
      3. Drop every "BA" and "TR" code from what is left.

    Parameters
    ----------
    mos : str
        Codes separated by whitespace.

    Returns
    -------
    str
        The remaining codes joined by single spaces (may be empty).
    """
    tokens = mos.split()

    # Position just past the first "IA"; 0 when the sequence has no "IA" at all
    try:
        start = tokens.index("IA") + 1
    except ValueError:
        start = 0

    # Skip the run of "IA" codes directly after the first one
    while start < len(tokens) and tokens[start] == "IA":
        start += 1

    # Keep everything else except the "BA" / "TR" codes
    return ' '.join(tok for tok in tokens[start:] if tok != "BA" and tok != "TR")

# Derive the 'cleaned_mos' column by running clean_mos_sequence on every raw 'mos' value
df['cleaned_mos'] = df['mos'].map(clean_mos_sequence)


Logistic Regression

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd

# Requires `df` with the 'cleaned_mos' column produced by clean_mos_sequence().

# TF-IDF encode the 'cleaned_mos' sequences, one feature per whitespace-separated code.
# lowercase=False keeps codes that differ only by case distinct; token_pattern=None
# silences scikit-learn's "token_pattern will not be used" warning, which is emitted
# whenever a custom tokenizer is supplied alongside the default pattern.
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(), lowercase=False, token_pattern=None)
X = vectorizer.fit_transform(df['cleaned_mos'].values)

# Binary target: 1 when 'resolved' is 'floor', 0 otherwise
y = (df['resolved'] == 'floor').astype(int)

# Hold out 20% of the rows; fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a logistic regression on the training portion
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# One coefficient per code; coef_ has a single row for a binary target
feature_names = vectorizer.get_feature_names_out()
coefficients = model.coef_[0]

# Tabulate the coefficients next to the code they belong to
feature_importance = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Order by magnitude so the most influential codes (either direction) come first
feature_importance['Absolute Coefficient'] = feature_importance['Coefficient'].abs()
feature_importance_sorted = feature_importance.sort_values(by='Absolute Coefficient', ascending=False)

# Show the ranked table (pandas truncates the middle rows when printing)
print(feature_importance_sorted)




   Feature  Coefficient  Absolute Coefficient
39      RS    10.079166             10.079166
47      TN    -5.391877              5.391877
42      TB    -5.041593              5.041593
29      PC     4.310617              4.310617
36      RC     4.281609              4.281609
..     ...          ...                   ...
59      eS     0.045563              0.045563
30      PI    -0.044125              0.044125
69      iT     0.024342              0.024342
56      eB    -0.021953              0.021953
48      TP    -0.009868              0.009868

[78 rows x 3 columns]


In [48]:
# Persist the full ranked logistic-regression table (index omitted); this file is
# read back by the cell that combines the logistic, SVM and ridge rankings.
feature_importance_sorted.to_csv('logistic_features.csv', index=False)

ML - XGBoost

In [50]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Same data preparation as the other model cells: TF-IDF over the codes in
# 'cleaned_mos' and a binary target that is 1 when 'resolved' is 'floor'.
# token_pattern=None silences scikit-learn's "token_pattern will not be used"
# warning, which is emitted whenever a custom tokenizer is supplied.
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(), lowercase=False, token_pattern=None)
X = vectorizer.fit_transform(df['cleaned_mos'].values)
y = (df['resolved'] == 'floor').astype(int)

# Hold out 20% of the rows; fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the model.
# `use_label_encoder` is no longer passed: the option was deprecated and then
# removed from XGBoost (recent releases only warn that it is unused), and the
# target is already encoded as 0/1 integers.
xgb_model = XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Pair each code with the importance score reported by the fitted model
feature_importances = xgb_model.feature_importances_
feature_names = vectorizer.get_feature_names_out()
ml_feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
ml_feature_importance_df = ml_feature_importance_df.sort_values(by='Importance', ascending=False)

print(ml_feature_importance_df)



   Feature  Importance
75      mt    0.313048
39      RS    0.142852
35      Pd    0.055902
33      PT    0.055104
14      FI    0.038958
..     ...         ...
56      eB    0.000000
27      OC    0.000000
51      Te    0.000000
45      TE    0.000000
62      iF    0.000000

[78 rows x 2 columns]


ML - Linear Support Vector Machine (LinearSVC)

In [52]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

# TF-IDF encode the codes in 'cleaned_mos'; target is 1 when 'resolved' is 'floor'.
# token_pattern=None silences scikit-learn's "token_pattern will not be used"
# warning, which is emitted whenever a custom tokenizer is supplied.
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(), lowercase=False, token_pattern=None)
X = vectorizer.fit_transform(df['cleaned_mos'].values)
y = (df['resolved'] == 'floor').astype(int)

# Hold out 20% of the rows; fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# random_state pins the solver's internal random number generation so repeated
# runs give the same coefficients (same seed as the train/test split).
# NOTE(review): coefficients may differ in the trailing digits from a run made
# without a seed.
svm_model = LinearSVC(max_iter=10000, random_state=42)
svm_model.fit(X_train, y_train)

# One coefficient per code; coef_ has a single row for a binary target
feature_names = vectorizer.get_feature_names_out()
coefficients = svm_model.coef_[0]
feature_importance_svm = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Order by magnitude so the most influential codes (either direction) come first
feature_importance_svm['Absolute Coefficient'] = feature_importance_svm['Coefficient'].abs()
feature_importance_svm_sorted = feature_importance_svm.sort_values(by='Absolute Coefficient', ascending=False)

print(feature_importance_svm_sorted.head(20))




   Feature  Coefficient  Absolute Coefficient
39      RS     2.589569              2.589569
47      TN    -1.857296              1.857296
10      DE     1.735202              1.735202
42      TB    -1.667842              1.667842
4       BL     1.517775              1.517775
36      RC     1.513772              1.513772
14      FI     1.468856              1.468856
15      FM     1.456148              1.456148
29      PC     1.453462              1.453462
66      iP     1.447207              1.447207
9       CT     1.419551              1.419551
25      NU     1.406764              1.406764
44      TD    -1.397162              1.397162
68      iS     1.357198              1.357198
43      TC    -1.301690              1.301690
21      LS     1.284685              1.284685
71      me     1.281550              1.281550
8       CD     1.246708              1.246708
35      Pd     1.234342              1.234342
46      TL    -1.228536              1.228536


In [53]:
# Persist the full ranked SVM table (index omitted); this file is read back by
# the cell that combines the logistic, SVM and ridge rankings.
feature_importance_svm_sorted.to_csv('svm_features.csv', index=False)

Ridge Classifier (with L2 Regularization)

In [54]:
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

# TF-IDF encode the codes in 'cleaned_mos'; target is 1 when 'resolved' is 'floor'.
# token_pattern=None silences scikit-learn's "token_pattern will not be used"
# warning, which is emitted whenever a custom tokenizer is supplied.
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(), lowercase=False, token_pattern=None)
X = vectorizer.fit_transform(df['cleaned_mos'].values)
y = (df['resolved'] == 'floor').astype(int)

# Hold out 20% of the rows; fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ridge classifier with its default settings
ridge_model = RidgeClassifier()
ridge_model.fit(X_train, y_train)

# One coefficient per code; coef_ has a single row for a binary target
feature_names = vectorizer.get_feature_names_out()
coefficients = ridge_model.coef_[0]
feature_importance_ridge = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Order by magnitude so the most influential codes (either direction) come first
feature_importance_ridge['Absolute Coefficient'] = feature_importance_ridge['Coefficient'].abs()
feature_importance_ridge_sorted = feature_importance_ridge.sort_values(by='Absolute Coefficient', ascending=False)

print(feature_importance_ridge_sorted.head(20))




   Feature  Coefficient  Absolute Coefficient
39      RS     2.026062              2.026062
47      TN    -1.779387              1.779387
42      TB    -1.631254              1.631254
10      DE     1.562741              1.562741
4       BL     1.497005              1.497005
14      FI     1.473131              1.473131
66      iP     1.452929              1.452929
29      PC     1.452430              1.452430
36      RC     1.449985              1.449985
15      FM     1.419483              1.419483
68      iS     1.388167              1.388167
44      TD    -1.387542              1.387542
43      TC    -1.277350              1.277350
21      LS     1.266163              1.266163
71      me     1.263555              1.263555
8       CD     1.239214              1.239214
35      Pd     1.228700              1.228700
46      TL    -1.214026              1.214026
60      eY     1.141885              1.141885
20      LC     1.063283              1.063283


In [55]:
# Persist the full ranked ridge table (index omitted); this file is read back by
# the cell that combines the logistic, SVM and ridge rankings.
feature_importance_ridge_sorted.to_csv('ridge_features.csv', index=False)

In [57]:
import pandas as pd

# Reload the three per-model coefficient tables saved by the earlier cells
df_logreg = pd.read_csv('logistic_features.csv')
df_svm = pd.read_csv('svm_features.csv')
df_ridge = pd.read_csv('ridge_features.csv')

# For every model: scale the signed coefficients by that model's largest magnitude,
# then rank the codes by magnitude (rank 1 = largest; ties share the lowest rank)
for coef_table in (df_logreg, df_svm, df_ridge):
    abs_coef = coef_table['Absolute Coefficient']
    coef_table['Normalized Coefficient'] = coef_table['Coefficient'] / abs_coef.max()
    coef_table['Rank'] = abs_coef.rank(method='min', ascending=False)

# Join the three tables on the code. The logistic/SVM columns receive the
# '_logreg' / '_svm' suffixes; the ridge columns do not clash with anything in
# the second join, so they keep their plain names 'Rank' and 'Normalized Coefficient'.
merge_cols = ['Feature', 'Rank', 'Normalized Coefficient']
df_merged = df_logreg[merge_cols].merge(
    df_svm[merge_cols], on='Feature', how='inner', suffixes=('_logreg', '_svm')
)
df_merged = df_merged.merge(
    df_ridge[merge_cols], on='Feature', how='inner', suffixes=('', '_ridge')
)

# Row-wise means across the three models (the un-suffixed columns are the ridge ones)
rank_cols = ['Rank_logreg', 'Rank_svm', 'Rank']
coef_cols = ['Normalized Coefficient_logreg', 'Normalized Coefficient_svm', 'Normalized Coefficient']
df_merged['Average Rank'] = df_merged[rank_cols].mean(axis=1)
df_merged['Average Coefficient'] = df_merged[coef_cols].mean(axis=1)

# Final table: best (lowest) average rank first
df_final = df_merged[['Feature', 'Average Rank', 'Average Coefficient']].sort_values(by='Average Rank')

# Save the consensus list; the file is named 'features_median.csv' but the values are means
df_final.to_csv('features_median.csv', index=False)
