In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import string
from tqdm import tqdm
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [4]:
df_calls = pd.read_csv('calls.csv')
df_test = pd.read_csv('test.csv')

join_column = 'call_id'

df1 = pd.merge(df_calls, df_test, on=join_column, how='right')

In [5]:
columns_to_keep = ['call_id', 'call_transcript']  

test_df = df1[columns_to_keep]

test_df.head()

Unnamed: 0,call_id,call_transcript
0,7732610078,\n\nAgent: Thank you for calling United Airlin...
1,2400299738,\n\nAgent: Thank you for calling United Airlin...
2,6533095063,\n\nAgent: Thank you for calling United Airlin...
3,7774450920,\n\nAgent: Thank you for calling United Airlin...
4,9214147168,\n\nAgent: Thank you for calling United Airlin...


In [6]:
test_df.shape

(5157, 2)

In [7]:
df_reason = pd.read_csv('reason.csv')

join_column = 'call_id'

df2 = pd.merge(df_reason, df_calls, on=join_column, how='left')

In [8]:
columns_to_keep = ['call_id', 'primary_call_reason', 'call_transcript']  

df = df2[columns_to_keep]

df.head()

Unnamed: 0,call_id,primary_call_reason,call_transcript
0,4667960400,Voluntary Cancel,\n\nAgent: Thank you for calling United Airlin...
1,1122072124,Booking,\n\nAgent: Thank you for calling United Airlin...
2,6834291559,IRROPS,\n\nAgent: Thank you for calling United Airlin...
3,2266439882,Upgrade,\n\nAgent: Thank you for calling United Airlin...
4,1211603231,Seating,\n\nAgent: Thank you for calling United Airlin...


In [9]:
df.shape

(66653, 3)

In [10]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vaibhav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vaibhav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
df.loc[:, 'primary_call_reason'] = df['primary_call_reason'].str.replace(' ', '', regex=True).str.replace('-', '', regex=True).str.lower()

print(df['primary_call_reason'].unique())

['voluntarycancel' 'booking' 'irrops' 'upgrade' 'seating' 'mileageplus'
 'checkout' 'voluntarychange' 'postflight' 'checkin' 'othertopics'
 'communications' 'schedulechange' 'products&services' 'digitalsupport'
 'disability' 'unaccompaniedminor' 'baggage' 'travelerupdates' 'etc'
 'productsandservices']


In [36]:
def preprocess_text(text):
    text = text.lower()
    
    pattern = r'\b\w+\b'
    tokens = re.findall(pattern, text)

    relevant_words = {
        'voluntarycancel', 'booking', 'irrops', 'upgrade', 'seating', 
        'mileageplus', 'checkout', 'voluntarychange', 'postflight', 
        'checkin', 'othertopics', 'communications', 'schedulechange', 
        'products&services', 'digitalsupport', 'disability', 
        'unaccompaniedminor', 'baggage', 'travelerupdates', 'etc', 
        'productsandservices'
    }

    stop_words = set(stopwords.words('english')).union({
        'thank', 'you', 'hello', 'hi', 'goodbye', 'thanks', 'agent', 
        'call', 'please', 'i', 'me', 'my', 'are', 'is', 'to', 
        'the', 'for', 'that', 'it', 'on', 'in', 'with', 'customer', 
        'let', 'change', 'help', 'like', 'thank', 'would', 'united', 
        'calling', 'okay', 'typing', 'get', 'next', 'work', 
        'day', 'take', 'yeah', 'need'
    })

    filtered_tokens = [word for word in tokens if word in relevant_words]
    
    return filtered_tokens

In [37]:
word_counts = {}

for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing Transcripts"):
    call_reason = row['primary_call_reason']
    transcript = row['call_transcript']
    
    tokens = preprocess_text(transcript)
    
    if call_reason not in word_counts:
        word_counts[call_reason] = Counter()
    
    word_counts[call_reason].update(tokens)

top_words = {reason: counts.most_common(10) for reason, counts in word_counts.items()}

for reason, words in tqdm(top_words.items(), desc="Finding Top Words"):
    print(f"Primary Call Reason: {reason}")
    print("Top 10 Words:", words)
    print()

Processing Transcripts: 100%|██████████████████████████████████████████████████| 66653/66653 [00:27<00:00, 2450.81it/s]
Finding Top Words: 100%|███████████████████████████████████████████████████████████████████████| 21/21 [00:00<?, ?it/s]

Primary Call Reason: voluntarycancel
Top 10 Words: [('booking', 913), ('upgrade', 433), ('baggage', 85), ('etc', 65), ('seating', 39), ('mileageplus', 27), ('communications', 3)]

Primary Call Reason: booking
Top 10 Words: [('booking', 1756), ('upgrade', 496), ('baggage', 98), ('etc', 72), ('seating', 49), ('mileageplus', 17), ('communications', 4), ('checkout', 1)]

Primary Call Reason: irrops
Top 10 Words: [('booking', 6973), ('upgrade', 3137), ('baggage', 573), ('etc', 454), ('seating', 319), ('mileageplus', 92), ('communications', 12), ('checkout', 10)]

Primary Call Reason: upgrade
Top 10 Words: [('booking', 1531), ('upgrade', 786), ('baggage', 121), ('etc', 86), ('seating', 66), ('mileageplus', 16), ('communications', 4), ('checkout', 2)]

Primary Call Reason: seating
Top 10 Words: [('booking', 3337), ('upgrade', 1645), ('baggage', 286), ('etc', 213), ('seating', 171), ('mileageplus', 38), ('checkout', 5), ('communications', 4)]

Primary Call Reason: mileageplus
Top 10 Words: [('




In [38]:
# created a feature vector for each transcript based on the top 10 words for each call reason
def create_features(transcript, top_words):
    tokens = preprocess_text(transcript)
    features = {}
    
    for reason, words in top_words.items():
        common_words = [word for word, count in words]
        features[reason] = sum(1 for word in tokens if word in common_words)
    
    return features

train_features = pd.DataFrame(
    [create_features(row['call_transcript'], top_words) for index, row in tqdm(df.iterrows(), total=len(df), desc="Creating Feature Vectors")]
)

# added the primary call reason as the target variable
train_features['primary_call_reason'] = df['primary_call_reason']

train_features.head()

Creating Feature Vectors: 100%|████████████████████████████████████████████████| 66653/66653 [00:27<00:00, 2419.55it/s]


Unnamed: 0,voluntarycancel,booking,irrops,upgrade,seating,mileageplus,checkout,voluntarychange,postflight,checkin,...,schedulechange,products&services,digitalsupport,disability,unaccompaniedminor,baggage,travelerupdates,etc,productsandservices,primary_call_reason
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,voluntarycancel
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,booking
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,irrops
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,upgrade
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,seating


In [43]:
#validation split and accuracy for logistic model
X = train_features.drop('primary_call_reason', axis=1)
y = train_features['primary_call_reason']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

model = LogisticRegression(max_iter=500)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_val_scaled)
validation_accuracy = accuracy_score(y_val, y_pred)

print(f"Validation Accuracy: {validation_accuracy:.2f}")
print("Classification Report on Validation Set:")
print(classification_report(y_val, y_pred))

Validation Accuracy: 0.21
Classification Report on Validation Set:
                     precision    recall  f1-score   support

            baggage       0.00      0.00      0.00       604
            booking       0.00      0.00      0.00       513
            checkin       0.00      0.00      0.00       359
           checkout       0.00      0.00      0.00       384
     communications       0.00      0.00      0.00       757
     digitalsupport       0.00      0.00      0.00       255
         disability       0.00      0.00      0.00        86
                etc       0.00      0.00      0.00       197
             irrops       0.21      0.99      0.34      2763
        mileageplus       0.00      0.00      0.00      1130
        othertopics       0.00      0.00      0.00       174
         postflight       0.00      0.00      0.00       848
  products&services       0.00      0.00      0.00        86
productsandservices       0.00      0.00      0.00       572
     schedulechan

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [50]:
test_features = pd.DataFrame([create_features(row['call_transcript'], top_words) for index, row in test_df.iterrows()])

test_predictions = model.predict(test_features)

test_df['primary_call_reason'] = test_predictions

output_df = test_df[['call_id', 'primary_call_reason']]
output_df.to_csv('test_vaibhav&aryan.csv', index=False)

print("Predictions saved to 'test_vaibhav&aryan.csv'.")

Predictions saved to 'test_vaibhav&aryan.csv'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['primary_call_reason'] = test_predictions
