In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

#### Load the dataset

In [2]:
# Read the raw call-level dataset; the analysis cells below all work off `df`.
df = pd.read_csv('data.csv')

In [None]:
# Tally the missing entries in every column before any analysis.
null_counts = df.isna().sum()
print("Null/Empty counts for each column:", null_counts, sep="\n")

In [4]:
# Display the column labels of the loaded frame.
df.columns

Index(['timestamp_call_key', 'retailer_code', 'serial', 'reason', 'mos',
       'resolved', 'no_of_accounts_with_syf_13_march',
       'account_balance_13_march', 'delinquency_history_13_march',
       'account_open_date_13_march', 'account_status_13_march',
       'card_activation_status_13_march', 'eservice_ind_13_march',
       'ebill_enrolled_status_13_march', 'auto_pay_enrolled_status_13_march',
       'no_of_accounts_with_syf_18_march', 'account_balance_18_march',
       'delinquency_history_18_march', 'account_open_date_18_march',
       'account_status_18_march', 'card_activation_status_18_march',
       'eservice_ind_18_march', 'ebill_enrolled_status_18_march',
       'auto_pay_enrolled_status_18_march', 'date_of_call', 'time_of_call'],
      dtype='object')

### Code-level Importance Analysis

Identifying the "starting" codes and their counts

In [5]:
# The first whitespace-separated token of each 'mos' value is its starting code.
starting_codes = df['mos'].str.split().str.get(0)

# Frequency of every distinct starting code, most common first.
starting_codes_count = starting_codes.value_counts()

print("Starting codes and their counts:", starting_codes_count, sep="\n")


Starting codes and their counts:
mos
IA    1384460
mn     409496
mm       4842
Name: count, dtype: int64


In [6]:
# Restrict to rows whose 'mos' text begins with "mn", then keep the 2nd and 3rd codes.
starts_with_mn = df['mos'].str.startswith('mn')
mn_sequences = df.loc[starts_with_mn, 'mos'].str.split().str.slice(1, 3)

# Join each pair back into one string so a pair counts as a single value.
mn_sequences_str = mn_sequences.str.join(' ')

# How often each distinct pair follows a leading "mn".
mn_unique_counts = mn_sequences_str.value_counts()

print("Unique cases where 'mos' starts with 'mn' and their counts:")
print(mn_unique_counts)
print(f"Total unique cases: {mn_unique_counts.count()}")

Unique cases where 'mos' starts with 'mn' and their counts:
mos
IA PP    176820
IA BA    132921
IA PI     40559
IA IA     22252
RS IA     17087
IA TR      4593
IA Ba      3959
IA DR      3155
mn IA      2069
m- IA      1803
IA CB      1515
IA AA      1220
IA DP       821
IA nl       183
IA FI       179
IA LW        87
mn RS        79
IA mm        69
IA           57
mn mn        43
IA OC        25
Name: count, dtype: int64
Total unique cases: 21


In [None]:
# Export every call whose 'mos' text begins with "mn mn RS" for manual inspection.
is_mn_mn_rs = df['mos'].str.startswith('mn mn RS')
mn_mn_rs_rows = df.loc[is_mn_mn_rs]

mn_mn_rs_rows.to_csv('mn_mn_rs_rows.csv', index=False)

print(f"Saved {len(mn_mn_rs_rows)} rows to 'mn_mn_rs_rows.csv'.")

In [None]:
# Export every call whose 'mos' text begins with "mn mn mn" for manual inspection.
is_mn_mn_mn = df['mos'].str.startswith('mn mn mn')
mn_mn_mn_rows = df.loc[is_mn_mn_mn]

mn_mn_mn_rows.to_csv('mn_mn_mn_rows.csv', index=False)

print(f"Saved {len(mn_mn_mn_rows)} rows to 'mn_mn_mn_rows.csv'.")

In [7]:
# Restrict to rows whose 'mos' text begins with "mm", then keep the 2nd and 3rd codes.
starts_with_mm = df['mos'].str.startswith('mm')
mm_sequences = df.loc[starts_with_mm, 'mos'].str.split().str.slice(1, 3)

# Join each pair back into one string so a pair counts as a single value.
mm_sequences_str = mm_sequences.str.join(' ')

# How often each distinct pair follows a leading "mm".
mm_unique_counts = mm_sequences_str.value_counts()

print("Unique cases where 'mos' starts with 'mm' and their counts:")
print(mm_unique_counts)
print(f"Total unique cases: {mm_unique_counts.count()}")


Unique cases where 'mos' starts with 'mm' and their counts:
mos
IA IA    4652
mm IA     175
mm mm      15
Name: count, dtype: int64
Total unique cases: 3


In [None]:
# Export every call whose 'mos' text begins with "mm mm mm" for manual inspection.
is_mm_mm_mm = df['mos'].str.startswith('mm mm mm')
mm_mm_mm_rows = df.loc[is_mm_mm_mm]

mm_mm_mm_rows.to_csv('mm_mm_mm_rows.csv', index=False)

print(f"Saved {len(mm_mm_mm_rows)} rows to 'mm_mm_mm_rows.csv'.")

Count of rows where "IA" is immediately followed by "BA"

In [9]:
# Number of rows whose 'mos' contains the adjacent pair "IA BA" anywhere in the sequence.
# Each row is counted once, however many times the pair occurs in it.
has_ia_ba = df['mos'].str.contains('IA BA')
ia_ba_count = len(df.loc[has_ia_ba])

print(f'"IA" followed by "BA" occurs {ia_ba_count} times.')


"IA" followed by "BA" occurs 584268 times.


Rows containing IA, BA, and TR in any order

In [10]:
# A row qualifies when each of the three codes appears somewhere in 'mos'.
# These are three independent substring checks, so the order of the codes does not matter.
has_all_codes = pd.Series(True, index=df.index)
for code in ('IA', 'BA', 'TR'):
    has_all_codes &= df['mos'].str.contains(code)
rows_with_ia_ba_tr = len(df[has_all_codes])

print(f'Rows containing IA, BA, and TR in any order: {rows_with_ia_ba_tr}')


Rows containing IA, BA, and TR in any order: 498342


Rows ending with TR, divided by "resolved" value

In [11]:
# Calls whose 'mos' text ends in "TR", split by the value of the 'resolved' column.
ends_with_tr = df['mos'].str.endswith('TR')

# ... with resolved == 'resolved'
rows_ending_with_tr_resolved = len(df[ends_with_tr & (df['resolved'] == 'resolved')])

# ... with resolved == 'floor'
rows_ending_with_tr_floor = len(df[ends_with_tr & (df['resolved'] == 'floor')])

print(f'Rows ending with TR and resolved: {rows_ending_with_tr_resolved}')
print(f'Rows ending with TR and floor: {rows_ending_with_tr_floor}')


Rows ending with TR and resolved: 853475
Rows ending with TR and floor: 366217


Count of Resolved values

In [12]:
# Frequency of each distinct value of 'resolved' (the target used by the models below).
resolved_counts = df['resolved'].value_counts()

print("Counts of unique values in the 'resolved' column:", resolved_counts, sep="\n")

Counts of unique values in the 'resolved' column:
resolved
resolved    1432581
floor        366217
Name: count, dtype: int64


#### Analyzing most impactful codes

In [13]:
def clean_mos_sequence(mos):
    """Reduce a raw MOS path to the codes that follow its initial "IA" run.

    Steps:
      1. Split ``mos`` on whitespace into codes.
      2. If an "IA" code is present, discard everything up to and including
         the first "IA", plus any further "IA" codes that immediately follow
         it. A sequence containing no "IA" is left as is.
      3. Drop every "BA" and "TR" code from what remains.

    Args:
        mos: Space-separated MOS codes. A non-string value (e.g. NaN for a
            missing entry) is treated as an empty sequence instead of raising.

    Returns:
        The remaining codes joined by single spaces; '' when nothing is left.
    """
    # Missing values reach .apply() as float NaN, which has no .split()
    if not isinstance(mos, str):
        return ''

    codes = mos.split()

    if "IA" in codes:
        # Advance past the first "IA" and the run of "IA"s directly after it,
        # then slice once (avoids repeated pop(0) shifting of the list).
        start = codes.index("IA") + 1
        while start < len(codes) and codes[start] == "IA":
            start += 1
        codes = codes[start:]

    # "BA" and "TR" are removed wherever they occur in the remainder
    return ' '.join(code for code in codes if code not in ("BA", "TR"))

# Derive the cleaned sequence for every call from its raw 'mos' value.
df['cleaned_mos'] = df['mos'].map(clean_mos_sequence)


In [None]:
# Write the frame (now including 'cleaned_mos') to CSV without the index column.
# NOTE(review): an empty 'cleaned_mos' string is read back from CSV as NaN by default.
df.to_csv('data_cleaned_mos.csv', index=False)

Logistic Regression

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Encode each 'cleaned_mos' sequence as TF-IDF weights over its space-separated codes.
# - token_pattern=None: a custom tokenizer is supplied, and leaving the default pattern
#   in place makes scikit-learn warn that it will not be used.
# - lowercase=False: codes are case-sensitive (e.g. "BA" and "Ba" are distinct).
# - fillna(''): a missing sequence is treated as "no codes" instead of crashing the vectorizer.
vectorizer = TfidfVectorizer(tokenizer=str.split, token_pattern=None, lowercase=False)
X = vectorizer.fit_transform(df['cleaned_mos'].fillna('').values)

# Target: 1 when 'resolved' is 'floor', 0 otherwise
y = (df['resolved'] == 'floor').astype(int)

# Hold out 20% of the calls; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Report hold-out accuracy so the coefficients below can be read in context
print(f"Hold-out accuracy: {model.score(X_test, y_test):.4f}")

# One coefficient per code; a positive value pushes the prediction towards 'floor'
feature_names = vectorizer.get_feature_names_out()
coefficients = model.coef_[0]

feature_importance = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Rank codes by the magnitude of their coefficient, largest first
feature_importance['Absolute Coefficient'] = feature_importance['Coefficient'].abs()
feature_importance_sorted = feature_importance.sort_values(by='Absolute Coefficient', ascending=False)

print(feature_importance_sorted)




   Feature  Coefficient  Absolute Coefficient
39      RS    10.079166             10.079166
47      TN    -5.391877              5.391877
42      TB    -5.041593              5.041593
29      PC     4.310617              4.310617
36      RC     4.281609              4.281609
..     ...          ...                   ...
59      eS     0.045563              0.045563
30      PI    -0.044125              0.044125
69      iT     0.024342              0.024342
56      eB    -0.021953              0.021953
48      TP    -0.009868              0.009868

[78 rows x 3 columns]


In [48]:
# Persist the sorted logistic-regression coefficients; the averaging cell reloads this file.
feature_importance_sorted.to_csv('logistic_features.csv', index=False)

ML - XGBoost

In [50]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Same encoding as the logistic-regression cell: TF-IDF over the space-separated codes.
# token_pattern=None silences scikit-learn's "token_pattern will not be used" warning that
# appears whenever a custom tokenizer is given; fillna('') keeps missing sequences from
# crashing the vectorizer.
vectorizer = TfidfVectorizer(tokenizer=str.split, token_pattern=None, lowercase=False)
X = vectorizer.fit_transform(df['cleaned_mos'].fillna('').values)
y = (df['resolved'] == 'floor').astype(int)

# Hold out 20% of the calls; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# `use_label_encoder` is deprecated in XGBoost and no longer has any effect, so it is
# not passed; the labels are already integer 0/1.
xgb_model = XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Report hold-out accuracy so the importances below can be read in context
print(f"Hold-out accuracy: {xgb_model.score(X_test, y_test):.4f}")

# One importance value per code (non-negative, so it carries no direction), largest first
feature_importances = xgb_model.feature_importances_
feature_names = vectorizer.get_feature_names_out()
ml_feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
ml_feature_importance_df = ml_feature_importance_df.sort_values(by='Importance', ascending=False)

print(ml_feature_importance_df)



   Feature  Importance
75      mt    0.313048
39      RS    0.142852
35      Pd    0.055902
33      PT    0.055104
14      FI    0.038958
..     ...         ...
56      eB    0.000000
27      OC    0.000000
51      Te    0.000000
45      TE    0.000000
62      iF    0.000000

[78 rows x 2 columns]


ML - Support Vector Machine (SVM) with Linear Kernel

In [52]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

# TF-IDF over the space-separated codes of 'cleaned_mos'.
# token_pattern=None silences scikit-learn's "token_pattern will not be used" warning that
# appears whenever a custom tokenizer is given; fillna('') keeps missing sequences from
# crashing the vectorizer.
vectorizer = TfidfVectorizer(tokenizer=str.split, token_pattern=None, lowercase=False)
X = vectorizer.fit_transform(df['cleaned_mos'].fillna('').values)

# Target: 1 when 'resolved' is 'floor', 0 otherwise
y = (df['resolved'] == 'floor').astype(int)

# Hold out 20% of the calls; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# random_state pins the solver's internal shuffling so the coefficients are repeatable
svm_model = LinearSVC(max_iter=10000, random_state=42)
svm_model.fit(X_train, y_train)

# Report hold-out accuracy so the coefficients below can be read in context
print(f"Hold-out accuracy: {svm_model.score(X_test, y_test):.4f}")

# One coefficient per code; a positive value pushes the decision towards 'floor'
feature_names = vectorizer.get_feature_names_out()
coefficients = svm_model.coef_[0]
feature_importance_svm = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
feature_importance_svm['Absolute Coefficient'] = feature_importance_svm['Coefficient'].abs()
feature_importance_svm_sorted = feature_importance_svm.sort_values(by='Absolute Coefficient', ascending=False)

# Only the 20 largest are shown here; the full table is kept in feature_importance_svm_sorted
print(feature_importance_svm_sorted.head(20))




   Feature  Coefficient  Absolute Coefficient
39      RS     2.589569              2.589569
47      TN    -1.857296              1.857296
10      DE     1.735202              1.735202
42      TB    -1.667842              1.667842
4       BL     1.517775              1.517775
36      RC     1.513772              1.513772
14      FI     1.468856              1.468856
15      FM     1.456148              1.456148
29      PC     1.453462              1.453462
66      iP     1.447207              1.447207
9       CT     1.419551              1.419551
25      NU     1.406764              1.406764
44      TD    -1.397162              1.397162
68      iS     1.357198              1.357198
43      TC    -1.301690              1.301690
21      LS     1.284685              1.284685
71      me     1.281550              1.281550
8       CD     1.246708              1.246708
35      Pd     1.234342              1.234342
46      TL    -1.228536              1.228536


In [53]:
# Persist the full sorted SVM coefficient table; the averaging cell reloads this file.
feature_importance_svm_sorted.to_csv('svm_features.csv', index=False)

Ridge Classifier (with L2 Regularization)

In [54]:
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

# TF-IDF over the space-separated codes of 'cleaned_mos'.
# token_pattern=None silences scikit-learn's "token_pattern will not be used" warning that
# appears whenever a custom tokenizer is given; fillna('') keeps missing sequences from
# crashing the vectorizer.
vectorizer = TfidfVectorizer(tokenizer=str.split, token_pattern=None, lowercase=False)
X = vectorizer.fit_transform(df['cleaned_mos'].fillna('').values)

# Target: 1 when 'resolved' is 'floor', 0 otherwise
y = (df['resolved'] == 'floor').astype(int)

# Hold out 20% of the calls; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ridge_model = RidgeClassifier()
ridge_model.fit(X_train, y_train)

# Report hold-out accuracy so the coefficients below can be read in context
print(f"Hold-out accuracy: {ridge_model.score(X_test, y_test):.4f}")

# One coefficient per code; a positive value pushes the decision towards 'floor'
feature_names = vectorizer.get_feature_names_out()
coefficients = ridge_model.coef_[0]
feature_importance_ridge = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
feature_importance_ridge['Absolute Coefficient'] = feature_importance_ridge['Coefficient'].abs()
feature_importance_ridge_sorted = feature_importance_ridge.sort_values(by='Absolute Coefficient', ascending=False)

# Only the 20 largest are shown here; the full table is kept in feature_importance_ridge_sorted
print(feature_importance_ridge_sorted.head(20))




   Feature  Coefficient  Absolute Coefficient
39      RS     2.026062              2.026062
47      TN    -1.779387              1.779387
42      TB    -1.631254              1.631254
10      DE     1.562741              1.562741
4       BL     1.497005              1.497005
14      FI     1.473131              1.473131
66      iP     1.452929              1.452929
29      PC     1.452430              1.452430
36      RC     1.449985              1.449985
15      FM     1.419483              1.419483
68      iS     1.388167              1.388167
44      TD    -1.387542              1.387542
43      TC    -1.277350              1.277350
21      LS     1.266163              1.266163
71      me     1.263555              1.263555
8       CD     1.239214              1.239214
35      Pd     1.228700              1.228700
46      TL    -1.214026              1.214026
60      eY     1.141885              1.141885
20      LC     1.063283              1.063283


In [55]:
# Persist the full sorted Ridge coefficient table; the averaging cell reloads this file.
feature_importance_ridge_sorted.to_csv('ridge_features.csv', index=False)

Calculating mean coefficients, considering values from Logistic Regression, SVM, and Ridge Classifier

In [57]:
import pandas as pd

# Per-model coefficient tables exported by the earlier cells
df_logreg = pd.read_csv('logistic_features.csv')
df_svm = pd.read_csv('svm_features.csv')
df_ridge = pd.read_csv('ridge_features.csv')

# For each model: divide the signed coefficients by that model's largest magnitude
# (so they fall in [-1, 1]) and rank features by magnitude (1 = largest; ties share
# the smallest rank of their group).
for model_df in (df_logreg, df_svm, df_ridge):
    magnitude = model_df['Absolute Coefficient']
    model_df['Normalized Coefficient'] = model_df['Coefficient'] / magnitude.max()
    model_df['Rank'] = magnitude.rank(method='min', ascending=False)

shared_columns = ['Feature', 'Rank', 'Normalized Coefficient']

# Inner-join the three tables on the feature name. The first join suffixes the clashing
# columns with _logreg / _svm; in the second join nothing clashes any more, so the Ridge
# columns keep their plain names ('Rank', 'Normalized Coefficient').
df_merged = df_logreg[shared_columns].merge(
    df_svm[shared_columns], on='Feature', how='inner', suffixes=('_logreg', '_svm')
)
df_merged = df_merged.merge(
    df_ridge[shared_columns], on='Feature', how='inner', suffixes=('', '_ridge')
)

# Row-wise mean across the three models
rank_columns = ['Rank_logreg', 'Rank_svm', 'Rank']
coefficient_columns = ['Normalized Coefficient_logreg', 'Normalized Coefficient_svm', 'Normalized Coefficient']
df_merged['Average Rank'] = df_merged[rank_columns].mean(axis=1)
df_merged['Average Coefficient'] = df_merged[coefficient_columns].mean(axis=1)

# Best (lowest) average rank first
df_final = df_merged[['Feature', 'Average Rank', 'Average Coefficient']].sort_values(by='Average Rank')

# Despite the file name, the values written are means (averages), not medians
df_final.to_csv('features_median.csv', index=False)


### Sequence-level Importance Analysis

Identifying most common paths

In [None]:
from collections import Counter

# Turn every cleaned sequence into a hashable tuple of codes and count identical paths.
# Missing sequences are skipped first: a NaN survives .str.split() and tuple(NaN) raises
# TypeError, and the NaN rows are only dropped from `df` further down the notebook.
mos_sequences = df['cleaned_mos'].dropna().str.split().apply(tuple)

# List of (path_tuple, count) pairs, most frequent first
most_common_paths = Counter(mos_sequences).most_common()

Some pre-processing with the cleaned_mos column

In [None]:
# Which Python types actually occur in 'cleaned_mos'? (anything other than str needs handling)
unique_data_types = df['cleaned_mos'].map(type).unique()
print(unique_data_types)

In [None]:
# Show the rows whose 'cleaned_mos' value is exactly of type float
is_float_value = df['cleaned_mos'].map(type).eq(float)
float_rows = df.loc[is_float_value]
print(float_rows)

In [None]:
# How many 'cleaned_mos' entries are missing
nan_count = df['cleaned_mos'].isnull().sum()
print("Number of NaN values in 'cleaned_mos' column:", nan_count)

In [None]:
# Discard the calls that have no cleaned sequence
df = df.dropna(subset=['cleaned_mos'])

Visualizing most common paths

In [None]:
# Ten most frequent paths: label = codes joined by spaces, value = number of calls
top_paths = most_common_paths[:10]
mos_labels = [' '.join(path) for path, _ in top_paths]
counts = [frequency for _, frequency in top_paths]

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(mos_labels, counts)
ax.set_xlabel('Frequency')
ax.set_ylabel('MOS Path')
ax.set_title('Top 10 Most Common MOS Paths')
ax.invert_yaxis()  # most frequent path at the top
plt.show()


Grouping cleaned_mos by value and calculating Floor Percentage

In [None]:
# Pair each call's cleaned sequence (as a hashable tuple of codes) with its outcome
mos_sequences_df = pd.DataFrame({
    'mos_sequence_tuple': df['cleaned_mos'].str.split().apply(tuple),
    'resolved_status': df['resolved'],
})

# One row per sequence, one count column per outcome ('floor' / 'resolved'),
# plus the total and the share of calls that went to the floor (in percent)
grouped_counts = (
    mos_sequences_df
    .groupby(['mos_sequence_tuple', 'resolved_status'])
    .size()
    .unstack(fill_value=0)
    .assign(
        total_occurrences=lambda counts: counts['resolved'] + counts['floor'],
        floor_percentage=lambda counts: (counts['floor'] / counts['total_occurrences']) * 100,
    )
)

# Highest floor share first; bring the sequence back as an ordinary column
sorted_grouped_counts = (
    grouped_counts
    .sort_values(by='floor_percentage', ascending=False)
    .reset_index()
)

# Readable text form of the tuple
sorted_grouped_counts['mos_sequence'] = sorted_grouped_counts['mos_sequence_tuple'].map(' '.join)

# Presentation table with human-readable column names
mos_df = (
    sorted_grouped_counts[['mos_sequence', 'floor_percentage', 'total_occurrences', 'resolved', 'floor']]
    .rename(columns={
        'mos_sequence': 'MOS Sequence',
        'floor_percentage': 'Floor Call Percentage',
        'total_occurrences': 'Total Occurrences',
        'resolved': 'Number of Resolved Calls',
        'floor': 'Number of Floor Calls'
    })
)


Calculating weighted scores

In [None]:
# Relative weight of the floor percentage vs. the number of occurrences
w_fp = 0.6
w_to = 0.4

# Weighted harmonic mean of (floor % + 1) and (occurrences + 1);
# the +1 keeps both denominators away from zero.
floor_term = w_fp / (mos_df['Floor Call Percentage'] + 1)
volume_term = w_to / (mos_df['Total Occurrences'] + 1)
mos_df['weighted_score'] = (w_fp + w_to) / (floor_term + volume_term)

# Highest score first
mos_df_sorted = mos_df.sort_values(by='weighted_score', ascending=False)

preview_columns = ['MOS Sequence', 'Floor Call Percentage', 'Total Occurrences', 'weighted_score']
print(mos_df_sorted[preview_columns].head(10))


Removing occurrences of "nl", "mt", "mo", "mm", and "mn"

In [None]:

# Codes that are stripped from every sequence before aggregation
keywords_to_remove = ['nl', 'mt', 'mo', 'mm', 'mn']

def remove_keywords(sequence):
    """Return `sequence` without any code listed in `keywords_to_remove`.

    The text is split on whitespace, the listed codes are filtered out, and the
    survivors are joined by single spaces ('' when nothing survives).
    """
    kept_codes = filter(lambda code: code not in keywords_to_remove, sequence.split())
    return ' '.join(kept_codes)

# Strip the unwanted codes from every sequence in the scored table
mos_df_sorted['MOS Sequence'] = mos_df_sorted['MOS Sequence'].map(remove_keywords)

print(mos_df_sorted['MOS Sequence'])

In [None]:
# Drop sequences that have no codes left. remove_keywords() yields '' (never NaN) when
# every code was stripped, so a plain dropna() would keep those rows; they would then be
# written to CSV as empty fields and come back as NaN when the file is reloaded.
has_no_codes = mos_df_sorted['MOS Sequence'].fillna('').str.strip().eq('')
mos_df_sorted = mos_df_sorted.loc[~has_no_codes].copy()

Normalizing the weighted score

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the weighted score with a min-max scaler (default output range)
scaler = MinMaxScaler()
weighted_score = mos_df_sorted[['weighted_score']]
mos_df_sorted['normalized_weighted_score'] = scaler.fit(weighted_score).transform(weighted_score)

print(mos_df_sorted)

In [None]:
# After keyword removal several rows can share the same sequence, so their call counts
# are summed. Only the count columns are selected (instead of dropping the score columns
# in place), which leaves mos_df_sorted untouched and lets this cell be re-run safely.
count_columns = ['Total Occurrences', 'Number of Resolved Calls', 'Number of Floor Calls']
grouped_df = mos_df_sorted.groupby('MOS Sequence')[count_columns].sum().reset_index()

# The floor share must be recomputed from the summed counts; percentages cannot be added
grouped_df['Floor Percentage'] = (grouped_df['Number of Floor Calls'] / grouped_df['Total Occurrences']) * 100

# Highest floor share first
grouped_df = grouped_df.sort_values(by='Floor Percentage', ascending=False)

print(grouped_df)

In [None]:
# Relative weight of the floor percentage vs. the number of occurrences
w_fp = 0.6
w_to = 0.4

# Weighted harmonic mean of (floor % + 1) and (occurrences + 1), on the aggregated table;
# the +1 keeps both denominators away from zero.
floor_term = w_fp / (grouped_df['Floor Percentage'] + 1)
volume_term = w_to / (grouped_df['Total Occurrences'] + 1)
grouped_df['weighted_score'] = (w_fp + w_to) / (floor_term + volume_term)

# Highest score first
grouped_df = grouped_df.sort_values(by='weighted_score', ascending=False)

preview_columns = ['MOS Sequence', 'Floor Percentage', 'Total Occurrences', 'weighted_score']
print(grouped_df[preview_columns].head(10))

In [None]:

# Rescale the aggregated weighted score with a fresh min-max scaler (default output range)
scaler = MinMaxScaler()
weighted_score = grouped_df[['weighted_score']]
grouped_df['normalized_weighted_score'] = scaler.fit(weighted_score).transform(weighted_score)

preview_columns = ['MOS Sequence', 'Floor Percentage', 'Total Occurrences', 'weighted_score', 'normalized_weighted_score']
print(grouped_df[preview_columns].head(10))


In [None]:
# Persist the scored table (sorted by weighted score, highest first);
# the network-graph and nodes/edges cells reload this file.
grouped_df.to_csv('cleaned_mos_analysis_weighted.csv', index=False)

Visualizing most common codes, derived from the most impactful Sequences

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.patheffects as path_effects 

file_path = 'cleaned_mos_analysis_weighted.csv' 
data = pd.read_csv(file_path)

# An empty 'MOS Sequence' field is read back from CSV as NaN (a float with no .split()),
# so missing sequences are removed before taking the first 100 rows of the file.
data_subset = data.dropna(subset=['MOS Sequence']).head(100)

G = nx.Graph()

# Number of times each code appears across the selected sequences
node_sizes = {}

for seq in data_subset['MOS Sequence']:
    codes = seq.split()
    if not codes:
        continue  # whitespace-only sequence: nothing to add
    # Connect every pair of neighbouring codes
    G.add_edges_from(zip(codes, codes[1:]))
    for code in codes:
        node_sizes[code] = node_sizes.get(code, 0) + 1

# Min-max scale the counts into [20, 220]; when every code occurs equally often the
# span is zero, so fall back to 1 to avoid dividing by zero.
max_size = max(node_sizes.values())
min_size = min(node_sizes.values())
size_span = (max_size - min_size) or 1
normalized_sizes = {node: ((size - min_size) / size_span * 200 + 20) for node, size in node_sizes.items()}

# Fixed seed so the layout is the same on every run
pos = nx.spring_layout(G, seed=42)

plt.figure(figsize=(12, 9), facecolor='white')
ax = plt.gca()

# Both marker size and colour encode how often the code occurs
nodes = nx.draw_networkx_nodes(G, pos, node_size=[normalized_sizes[node] * 20 for node in G.nodes()],
                               node_color=[normalized_sizes[node] for node in G.nodes()],
                               cmap=plt.cm.Pastel2, alpha=1.0, edgecolors='#141414', linewidths=0)

edges = nx.draw_networkx_edges(G, pos, edge_color='#c4c3c3', alpha=0.5, width=1)

labels = nx.draw_networkx_labels(G, pos, font_size=8, font_color="black")

plt.title("MOS Codes which lead to greater Floor calls", fontsize=20)
plt.axis('off')
plt.show()

Saving nodes and edges

In [None]:
import pandas as pd

file_path = 'cleaned_mos_analysis_weighted.csv' 
data = pd.read_csv(file_path)

# An empty 'MOS Sequence' field is read back from CSV as NaN (a float with no .split()),
# so missing sequences are removed before taking the first 100 rows of the file.
data_subset = data.dropna(subset=['MOS Sequence']).head(100)

nodes = set()
edges = []

# One edge per pair of neighbouring codes, weighted by the sequence's normalized score.
# A pair that occurs in several sequences yields several rows.
for sequence_text, normalized_weighted_score in zip(
    data_subset['MOS Sequence'], data_subset['normalized_weighted_score']
):
    sequence = sequence_text.split()
    for source, target in zip(sequence, sequence[1:]):
        edges.append((source, target, normalized_weighted_score))
        nodes.update((source, target))

# Sorted so nodes.csv has the same row order on every run (set iteration order can vary)
nodes_df = pd.DataFrame({'Id': sorted(nodes)})
edges_df = pd.DataFrame(edges, columns=['Source', 'Target', 'Weight'])

nodes_file_path = 'nodes.csv'
edges_file_path = 'edges.csv'

nodes_df.to_csv(nodes_file_path, index=False)
edges_df.to_csv(edges_file_path, index=False)

print(f"Nodes and edges CSV files have been saved to {nodes_file_path} and {edges_file_path}, respectively.")