In [3]:
import numpy  as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import matplotlib.pyplot as plt

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [29]:
call_reason_merged = pd.read_csv('calls_reason.csv')
call = pd.read_csv('callsf0d4f5a.csv')
reason = pd.read_csv('reason18315ff.csv')

In [166]:
cleaned_data_100 = pd.read_csv('merged_table_100_good.csv')

In [37]:
cleaned_data_200 = pd.read_csv('merged_table_200.csv')
cleaned_data_300 = pd.read_csv('merged_table_300.csv')

In [167]:
cleaned_data_100.head()

Unnamed: 0.1,Unnamed: 0,call_id,customer_id,agent_id,call_transcript,call_start_datetime,agent_assigned_datetime,call_end_datetime,primary_call_reason
0,0,3817230689,390235425,922729,clair clair seattl thursday johnson ua1572 10a...,2024-08-01 05:03:00,2024-08-01 05:12:00,2024-08-01 05:25:00,Post Flight
1,1,4886165173,9360139623,964561,steve suppos 3 miss connect sorri hear never f...,2024-08-01 05:26:00,2024-08-01 05:36:00,2024-08-01 06:20:00,Post Flight
2,2,9310693795,4367883724,758927,adam complain suppos morn miss someth import s...,2024-08-01 05:52:00,2024-08-01 06:02:00,2024-08-01 06:06:00,Seating
3,3,4963458977,5621041498,274746,sarah sarah york 3 miss connect sorri hear tic...,2024-08-01 06:04:00,2024-08-01 06:08:00,2024-08-01 06:11:00,Communications
4,4,4458776244,5335489097,878048,steve suppos morn miss import sorri hear sir k...,2024-08-01 06:00:00,2024-08-01 06:09:00,2024-08-01 06:24:00,Seating


In [168]:
vectorizer = TfidfVectorizer(max_features=1000)  # Limit to top 1000 features if needed
X = vectorizer.fit_transform(cleaned_data_100['call_transcript'])


In [124]:
vectorizer = TfidfVectorizer(max_features=1000)  # Limit to top 1000 features if needed
X = vectorizer.fit_transform(call_reason_merged['call_transcript'])


In [171]:
split_dataframes={}
for i  in cleaned_data_100['primary_call_reason']:
    split_dataframes[i] = cleaned_data_100[cleaned_data_100['primary_call_reason']==i]
    print(i)

Post Flight
Post Flight
Seating
Communications
Seating
Traveler Updates
Communications
Upgrade
IRROPS
Seating
IRROPS
IRROPS
Products and Services
Mileage Plus
nan
Mileage Plus
Seating
IRROPS
Voluntary Change
Booking
nan
IRROPS
IRROPS
Voluntary Change
Post Flight
IRROPS
nan
Seating
Seating
Voluntary Change
Traveler Updates
Digital Support
nan
Seating
Post Flight
Products and Services
Post Flight
Products and Services
Digital Support
nan
IRROPS
Seating
Traveler Updates
IRROPS
IRROPS
Communications
nan
Upgrade
Disability
Voluntary Change
IRROPS
Mileage Plus
Post Flight
Voluntary Cancel
Booking
Booking
IRROPS
Upgrade
Digital Support
Voluntary Change
Baggage
Seating
Post Flight
ETC
Seating
Seating
IRROPS
Mileage Plus
IRROPS
Voluntary Change
Voluntary Cancel
IRROPS
IRROPS
IRROPS
nan
Other Topics
Post Flight
Check In
IRROPS
Baggage
Voluntary Change
IRROPS
Seating
Baggage
Post Flight
Voluntary Change
Products and Services
Post Flight
nan
Schedule Change
ETC
Upgrade
nan
Products and Services
Di

In [183]:
# split_dataframes_uncleaned={}
# for i in call_reason_merged['primary_call_reason']:
#     split_dataframes_uncleaned[i] = call_reason_merged[call_reason_merged['primary_call_reason']==i]
#     print(i)

In [190]:
def apply_lda_to_dataframe(df, n_topics=4, n_top_words=10):
    
    # Step 1: Vectorize the text data using CountVectorizer
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(df['call_transcript'])
    
    # Step 2: Apply LDA
    lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda_model.fit(doc_term_matrix)
    
    # Step 3: Extract the top words for each topic
    words = vectorizer.get_feature_names()  
    
    topic_keywords = []
    for topic_idx, topic in enumerate(lda_model.components_):
        top_words = [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]  # Top `n_top_words`
        topic_keywords.append(top_words)
    
    # Step 4: Get the topic distribution for each document
    topic_distribution = lda_model.transform(doc_term_matrix)
    
    return lda_model, topic_keywords, topic_distribution

In [191]:
lda_models = {}
topic_keywords = {}
topic_distributions = {}

# Number of topics you want to find per call reason
n_topics_per_reason = 4

# Apply LDA to each split DataFrame
for reason, df in split_dataframes.items():
    if not df.empty:  # Check if the DataFrame is not empty
        print(f"Applying LDA for reason: {reason}")
        lda_model, keywords, distribution = apply_lda_to_dataframe(df, n_topics=n_topics_per_reason)
        
        # Store the LDA model, topic keywords, and topic distribution for each reason
        lda_models[reason] = lda_model
        topic_keywords[reason] = keywords
        topic_distributions[reason] = distribution

        # Assign topic distribution and dominant topic to each transcript in the DataFrame
        df['topic_distribution'] = list(distribution)
        df['dominant_topic'] = distribution.argmax(axis=1)

        # Save the updated DataFrame back
        split_dataframes[reason] = df


Applying LDA for reason: Post Flight
Applying LDA for reason: Seating
Applying LDA for reason: Communications
Applying LDA for reason: Traveler Updates
Applying LDA for reason: Upgrade
Applying LDA for reason: IRROPS
Applying LDA for reason: Products and Services
Applying LDA for reason: Mileage Plus
Applying LDA for reason: Voluntary Change
Applying LDA for reason: Booking
Applying LDA for reason: Digital Support
Applying LDA for reason: Disability
Applying LDA for reason: Voluntary Cancel
Applying LDA for reason: Baggage
Applying LDA for reason: ETC
Applying LDA for reason: Other Topics
Applying LDA for reason: Check In
Applying LDA for reason: Schedule Change
Applying LDA for reason: Checkout
Applying LDA for reason: Unaccompanied Minor


In [192]:
# lda_models_uncleaned = {}
# topic_keywords_uncleaned = {}
# topic_distributions_uncleaned = {}

# # Number of topics you want to find per call reason
# n_topics_per_reason = 4

# # Apply LDA to each split DataFrame
# for reason, df in split_dataframes_uncleaned.items():
#     if not df.empty:  # Check if the DataFrame is not empty
#         print(f"Applying LDA for reason: {reason}")
#         lda_model_uncleaned, keywords_uncleaned, distribution_uncleaned = apply_lda_to_dataframe(df, n_topics=n_topics_per_reason)
        
#         # Store the LDA model, topic keywords, and topic distribution for each reason
#         lda_models_uncleaned[reason] = lda_model_uncleaned
#         topic_keywords_uncleaned[reason] = keywords_uncleaned
#         topic_distributions_uncleaned[reason] = distribution_uncleaned

#         # Assign topic distribution and dominant topic to each transcript in the DataFrame
#         df['topic_distribution'] = list(distribution_uncleaned)
#         df['dominant_topic'] = distribution_uncleaned.argmax(axis=1)

#         # Save the updated DataFrame back
#         split_dataframes_uncleaned[reason] = df


In [184]:
# Display the topics for each reason
for reason, topics in topic_keywords.items():
    print(f"\nCall Reason: {reason}")
    for idx, topic in enumerate(topics):
        print(f"  Topic {idx + 1}: {', '.join(topic)}")

        


Call Reason: Post Flight
  Topic 1: voucher, weather, experi, provid, miss, bag, refund, compens, apolog, happen
  Topic 2: tomorrow, sir, apolog, ugh, connect, guess, inconveni, suppos, sigh, york
  Topic 3: credit, refund, ticket, abl, use, offer, 200, someth, feel, pay
  Topic 4: great, monday, return, tuesday, thursday, friday, email, 150, enjoy, safe

Call Reason: Seating
  Topic 1: 150, monday, great, friday, differ, waiv, tuesday, john, thursday, saturday
  Topic 2: upgrad, bag, doubl, question, economi, everyth, think, great, safe, glad
  Topic 3: london, weather, safe, feel, return, thing, departur, forecast, case, standbi
  Topic 4: voucher, sir, apolog, miss, tomorrow, guess, refund, provid, sorri, connect

Call Reason: Communications
  Topic 1: voucher, sir, apolog, miss, refund, guess, tomorrow, provid, sorri, experi
  Topic 2: 150, monday, return, great, friday, differ, waiv, tuesday, thursday, saturday
  Topic 3: upgrad, economi, tri, standbi, class, open, bag, list, th

In [188]:
# # Display the topics for each reason
# for reason, topics in topic_keywords_uncleaned.items():
#     print(f"\nCall Reason: {reason}")
#     for idx, topic in enumerate(topics):
#         print(f"  Topic {idx + 1}: {', '.join(topic)}")


In [196]:
# Display the first few rows of a specific reason DataFrame to show the dominant topics
reason = 'Products and Services'  # Example
print(split_dataframes[reason][['call_transcript', 'dominant_topic']])

                                         call_transcript  dominant_topic
12     phone dial ugh final someon answer hold forev ...               0
35     hello speak jame jame upcom york san francisco...               1
37     john suppos 2pm email say 1pm ua45212 sigh res...               1
86     john complain recent york whole experi nightma...               0
93     michael took york san francisco 3 miss import ...               0
100    kelli san francisco tomorrow 4 ridicul sigh so...               1
130    david final took long someon pick phone wait s...               0
133    sarah suppos tomorrow sorri hear happen due se...               1
172    jack assist sigh angrili london month log acco...               0
192    sam sam ah shoot paper rustl san francisco tue...               2
210    cust hello suppos morn afternoon john cust san...               0
231    john suppos morn gon na miss connect unaccept ...               0
242    john sigh nightmar sorri hear provid rustl p

In [140]:
split_dataframes['Baggage']

Unnamed: 0.1,Unnamed: 0,call_id,customer_id,agent_id,call_transcript,call_start_datetime,agent_assigned_datetime,call_end_datetime,primary_call_reason,topic_distribution,dominant_topic
60,60,687556900,9348058886,391212,steve london worri find rustl paper px456213 g...,2024-08-01 10:07:00,2024-08-01 10:16:00,2024-08-01 10:19:00,Baggage,"[0.0017255498494046096, 0.036716840199106815, ...",2
79,79,2835524012,8798729337,600666,michael speak upcom saturday three alreadi poi...,2024-08-01 10:55:00,2024-08-01 11:04:00,2024-08-01 11:07:00,Baggage,"[0.37946941371564086, 0.49506026417691734, 0.0...",1
83,83,4350015240,6223747949,215457,tomorrow boston lo angel john smith john smith...,2024-08-01 10:52:00,2024-08-01 11:03:00,2024-08-01 11:08:00,Baggage,"[0.7908433216792929, 0.14073251760309777, 0.00...",0
128,128,4075573598,4405753886,436472,evan hello complain recent absolut terribl sor...,2024-08-01 12:32:00,2024-08-01 12:43:00,2024-08-01 12:58:00,Baggage,"[0.0019923665621880684, 0.0020457528474216655,...",3
167,167,1589048482,1536516864,696673,steve upcom sigh nightmar tri figur sir provid...,2024-08-01 14:08:00,2024-08-01 14:16:00,2024-08-01 14:22:00,Baggage,"[0.23327694855792688, 0.664493710852834, 0.099...",1
182,182,7929224394,9455428741,645129,dave uh san francisco suppos 9am wo tomorrow s...,2024-08-03 07:50:00,2024-08-03 08:00:00,2024-08-03 08:02:00,Baggage,"[0.002280577974235544, 0.9930529740522991, 0.0...",1
202,202,607499611,5506632838,309976,steve upset recent complet disast sorri hear b...,2024-08-01 15:47:00,2024-08-01 15:58:00,2024-08-01 15:59:00,Baggage,"[0.001993001759754127, 0.0019639858618458617, ...",3
260,260,4220851729,1691252590,519294,sarah complain took absolut terribl sorri hear...,2024-08-01 18:27:00,2024-08-01 18:38:00,2024-08-01 18:40:00,Baggage,"[0.07385584692333906, 0.0017208316536878997, 0...",3
270,270,6612130758,970082846,633922,raj assist raj san francisco exactli concern d...,2024-08-01 19:46:00,2024-08-01 19:53:00,2024-08-01 20:09:00,Baggage,"[0.966854936590559, 0.0025059105895037003, 0.0...",0
309,309,2444517954,5174097533,519057,sam tri stupid websit ridicul tri keep give er...,2024-08-02 01:26:00,2024-08-02 01:36:00,2024-08-02 01:37:00,Baggage,"[0.20312975101975134, 0.7919572944504246, 0.00...",1


In [131]:
df.head()

Unnamed: 0.1,Unnamed: 0,call_id,customer_id,agent_id,call_start_datetime,agent_assigned_datetime,call_end_datetime,call_transcript,Handle_time,Wait_time,primary_call_reason,hour,topic_distribution,dominant_topic
4712,4712,8326955206,4706903050,595713,2024-08-03 13:15:00,2024-08-03 13:20:00,2024-08-03 13:21:00,\n\nAgent: Thank you for calling United Airlin...,1.0,5.0,Products and Services,13,"[0.27992571371305275, 0.37070599687624467, 0.3...",1
4725,4725,3218394191,8424797559,630891,2024-08-03 13:18:00,2024-08-03 13:22:00,2024-08-03 13:35:00,\n\nAgent: Thank you for calling United Airlin...,13.0,4.0,Products and Services,13,"[0.023574212775492983, 0.0016826199145946533, ...",2
4762,4762,1349952018,4822049335,981779,2024-08-03 13:23:00,2024-08-03 13:28:00,2024-08-03 13:38:00,\n\nAgent: Thank you for calling United Airlin...,10.0,5.0,Products and Services,13,"[0.0012422985117737815, 0.9961851660397325, 0....",1
4766,4766,4320026620,8873327345,642658,2024-08-03 13:24:00,2024-08-03 13:29:00,2024-08-03 14:15:00,\n\nAgent: Thank you for calling United Airlin...,46.0,5.0,Products and Services,13,"[0.0014983535194287344, 0.0015061871022569444,...",2
4788,4788,793280037,3878408890,437196,2024-08-03 13:26:00,2024-08-03 13:31:00,2024-08-03 13:32:00,\n\nAgent: Thank you for calling United Airlin...,1.0,5.0,Products and Services,13,"[0.002148028879356053, 0.13649338367109723, 0....",3


In [195]:

for reason, topics in topic_keywords.items():
    # Create an empty list to store the topics with their keywords as strings
    topic_descriptions = []
    
    # Iterate over each topic and create a string with the topic index and its keywords
    for idx, topic in enumerate(topics):
        topic_str = f"Topic {idx + 1}: {', '.join(topic)}"
        topic_descriptions.append(topic_str)
    
    # Convert the list of topic descriptions into a single string (for storing in the DataFrame)
    all_topics_str = "\n".join(topic_descriptions)
    
    # Check if the reason exists in split_dataframes
    if reason in split_dataframes:
        # Add a new column 'topic_keywords' to store the topics in the DataFrame for that reason
        split_dataframes[reason]['topic_keywords'] = all_topics_str
    
    # Optional: You can print or inspect the DataFrame to verify
    print(f"\nStored topics for reason: {reason}")
    print(split_dataframes[reason].head())  # Display first few rows to verify



Stored topics for reason: Post Flight
    Unnamed: 0     call_id  customer_id  agent_id  \
0            0  3817230689    390235425    922729   
1            1  4886165173   9360139623    964561   
24          24  5401438897   6942343286    274746   
34          34  6062040810   4689597701    131036   
36          36   949356313   6440512032    215457   

                                      call_transcript  call_start_datetime  \
0   clair clair seattl thursday johnson ua1572 10a...  2024-08-01 05:03:00   
1   steve suppos 3 miss connect sorri hear never f...  2024-08-01 05:26:00   
24  rachel rachel come 4532 denver tuesday questio...  2024-08-01 09:00:00   
34  anna speak anna upcom york lo angel found retu...  2024-08-01 09:13:00   
36  dave denver suppos yesterday miss connect sorr...  2024-08-01 09:20:00   

   agent_assigned_datetime    call_end_datetime primary_call_reason  \
0      2024-08-01 05:12:00  2024-08-01 05:25:00         Post Flight   
1      2024-08-01 05:36:00  202

In [194]:
split_dataframes['Seating']['dominant_topic'].value_counts()

0    2890
3    2238
2     646
1     591
Name: dominant_topic, dtype: int64

In [197]:
split_dataframes['Post Flight']

Unnamed: 0.1,Unnamed: 0,call_id,customer_id,agent_id,call_transcript,call_start_datetime,agent_assigned_datetime,call_end_datetime,primary_call_reason,topic_distribution,dominant_topic,topic_keywords
0,0,3817230689,390235425,922729,clair clair seattl thursday johnson ua1572 10a...,2024-08-01 05:03:00,2024-08-01 05:12:00,2024-08-01 05:25:00,Post Flight,"[0.07172884852518624, 0.002286080372992108, 0....",3,"Topic 1: voucher, weather, experi, provid, mis..."
1,1,4886165173,9360139623,964561,steve suppos 3 miss connect sorri hear never f...,2024-08-01 05:26:00,2024-08-01 05:36:00,2024-08-01 06:20:00,Post Flight,"[0.002157111262225661, 0.7512064752438898, 0.2...",1,"Topic 1: voucher, weather, experi, provid, mis..."
24,24,5401438897,6942343286,274746,rachel rachel come 4532 denver tuesday questio...,2024-08-01 09:00:00,2024-08-01 09:09:00,2024-08-01 09:41:00,Post Flight,"[0.0022386813477952763, 0.18753205900471942, 0...",3,"Topic 1: voucher, weather, experi, provid, mis..."
34,34,6062040810,4689597701,131036,anna speak anna upcom york lo angel found retu...,2024-08-01 09:13:00,2024-08-01 09:22:00,2024-08-01 09:35:00,Post Flight,"[0.0022040487495878593, 0.2825687401282032, 0....",3,"Topic 1: voucher, weather, experi, provid, mis..."
36,36,949356313,6440512032,215457,dave denver suppos yesterday miss connect sorr...,2024-08-01 09:20:00,2024-08-01 09:30:00,2024-08-01 09:58:00,Post Flight,"[0.002298233799500613, 0.9930623912474527, 0.0...",1,"Topic 1: voucher, weather, experi, provid, mis..."
52,52,2306900525,8636556192,153460,chri ugh final someon answer hold forev mad ap...,2024-08-01 09:45:00,2024-08-01 09:55:00,2024-08-01 10:17:00,Post Flight,"[0.950124008906379, 0.04570060651587364, 0.002...",0,"Topic 1: voucher, weather, experi, provid, mis..."
62,62,3789748403,3646033723,153460,david david uh ua1245 denver keyboard question...,2024-08-01 10:10:00,2024-08-01 10:19:00,2024-08-01 10:24:00,Post Flight,"[0.40688669787624093, 0.002109954023321613, 0....",3,"Topic 1: voucher, weather, experi, provid, mis..."
76,76,7161945208,5030908011,825827,steve absolut nightmar 5 explan miss import ru...,2024-08-01 10:41:00,2024-08-01 10:51:00,2024-08-01 10:58:00,Post Flight,"[0.8340607032163799, 0.0021348974567559054, 0....",0,"Topic 1: voucher, weather, experi, provid, mis..."
84,84,73275311,3654551536,825827,john john come background assist 1993 san fran...,2024-08-01 10:56:00,2024-08-01 11:06:00,2024-08-01 11:16:00,Post Flight,"[0.24573655068885175, 0.0014731480077162405, 0...",3,"Topic 1: voucher, weather, experi, provid, mis..."
87,87,3709853620,8015461432,462743,david ua1546 houston thursday seem sir um mont...,2024-08-01 11:03:00,2024-08-01 11:13:00,2024-08-01 11:18:00,Post Flight,"[0.0026719258502912297, 0.7679513665767436, 0....",1,"Topic 1: voucher, weather, experi, provid, mis..."


In [198]:
for reason,df in split_dataframes.items():
    split_dataframes[reason].to_csv(f"{reason}.csv")

In [201]:
import pandas as pd

# The new data as a dictionary
new_topic_keywords = {
    "Post Flight": [
        ["voucher", "weather", "experi", "provid", "miss", "bag", "refund", "compens", "apolog", "happen"],
        ["tomorrow", "sir", "apolog", "ugh", "connect", "guess", "inconveni", "suppos", "sigh", "york"],
        ["credit", "refund", "ticket", "abl", "use", "offer", "200", "someth", "feel", "pay"],
        ["great", "monday", "return", "tuesday", "thursday", "friday", "email", "150", "enjoy", "safe"]
    ],
    "Seating": [
        ["150", "monday", "great", "friday", "differ", "waiv", "tuesday", "john", "thursday", "saturday"],
        ["upgrad", "bag", "doubl", "question", "economi", "everyth", "think", "great", "safe", "glad"],
        ["london", "weather", "safe", "feel", "return", "thing", "departur", "forecast", "case", "standbi"],
        ["voucher", "sir", "apolog", "miss", "tomorrow", "guess", "refund", "provid", "sorri", "connect"]
    ],
    "Communications": [
        ["voucher", "sir", "apolog", "miss", "refund", "guess", "tomorrow", "provid", "sorri", "experi"],
        ["150", "monday", "return", "great", "friday", "differ", "waiv", "tuesday", "thursday", "saturday"],
        ["upgrad", "economi", "tri", "standbi", "class", "open", "bag", "list", "think", "question"],
        ["weather", "connect", "denver", "safe", "london", "arriv", "depart", "feel", "forecast", "san"]
    ],
    "Traveler Updates": [
        ["monday", "tuesday", "150", "waiv", "sam", "pay", "75", "john", "smith", "someth"],
        ["san", "francisco", "return", "great", "think", "safe", "question", "john", "enjoy", "depart"],
        ["sir", "apolog", "voucher", "miss", "guess", "refund", "connect", "provid", "tomorrow", "sorri"],
        ["friday", "differ", "saturday", "thursday", "great", "pay", "hmm", "ticket", "goodby", "150"]
    ],
    "Upgrade": [
        ["150", "monday", "return", "waiv", "differ", "tuesday", "pay", "great", "hmm", "friday"],
        ["voucher", "refund", "experi", "provid", "compens", "miss", "apolog", "sir", "offer", "sorri"],
        ["tomorrow", "connect", "ugh", "sir", "guess", "denver", "apolog", "inconveni", "san", "suppos"],
        ["great", "question", "safe", "departur", "think", "email", "depart", "convers", "glad", "enjoy"]
    ],
    "IRROPS": [
        ["weather", "london", "feel", "question", "safe", "upgrad", "think", "bag", "depart", "great"],
        ["tomorrow", "connect", "sir", "ugh", "guess", "apolog", "inconveni", "york", "miss", "sigh"],
        ["monday", "150", "friday", "great", "tuesday", "differ", "thursday", "saturday", "john", "smith"],
        ["refund", "voucher", "experi", "miss", "compens", "provid", "offer", "apolog", "sir", "500"]
    ],
    "Products and Services": [
        ["sir", "voucher", "refund", "apolog", "guess", "miss", "provid", "sorri", "inconveni", "compens"],
        ["tomorrow", "connect", "denver", "weather", "safe", "arriv", "morn", "depart", "ugh", "email"],
        ["monday", "return", "great", "150", "thursday", "tuesday", "friday", "email", "waiv", "differ"],
        ["upgrad", "bag", "credit", "economi", "think", "thing", "ticket", "question", "london", "class"]
    ],
    "Mileage Plus": [
        ["refund", "voucher", "experi", "miss", "provid", "offer", "compens", "apolog", "sir", "sorri"],
        ["question", "weather", "london", "safe", "departur", "feel", "think", "great", "depart", "doubl"],
        ["tomorrow", "connect", "sir", "ugh", "guess", "miss", "apolog", "inconveni", "sigh", "denver"],
        ["return", "150", "monday", "great", "friday", "thursday", "pay", "tuesday", "waiv", "differ"]
    ],
    "Voluntary Change": [
        ["150", "waiv", "monday", "differ", "pay", "credit", "return", "ticket", "friday", "abl"],
        ["upgrad", "connect", "denver", "tomorrow", "ugh", "tri", "arriv", "economi", "open", "minut"],
        ["voucher", "sir", "apolog", "miss", "refund", "provid", "guess", "sorri", "weather", "experi"],
        ["great", "friday", "depart", "enjoy", "email", "san", "return", "safe", "francisco", "question"]
    ],
    "Booking": [
        ["monday", "return", "150", "friday", "tuesday", "great", "differ", "thursday", "hmm", "pay"],
        ["tomorrow", "connect", "sir", "ugh", "guess", "apolog", "denver", "inconveni", "miss", "sigh"],
        ["voucher", "refund", "experi", "provid", "miss", "sir", "compens", "apolog", "offer", "sorri"],
        ["question", "weather", "london", "safe", "depart", "great", "feel", "think", "doubl", "glad"]
    ],
    "Digital Support": [
        ["150", "monday", "friday", "return", "differ", "tuesday", "great", "pay", "email", "hmm"],
        ["refund", "credit", "voucher", "provid", "experi", "offer", "compens", "sir", "futur", "miss"],
        ["connect", "tomorrow", "miss", "apolog", "sir", "guess", "voucher", "ugh", "inconveni", "suppos"],
        ["san", "francisco", "question", "upgrad", "safe", "weather", "departur", "think", "doubl", "feel"]
    ],
    "Disability": [
        ["refund", "voucher", "credit", "provid", "experi", "futur", "compens", "weather", "miss", "offer"],
        ["upgrad", "bag", "class", "apolog", "economi", "guess", "sir", "caus", "busi", "mile"],
        ["connect", "miss", "tomorrow", "denver", "ugh", "guess", "sir", "apolog", "rebook", "suppos"],
        ["great", "thursday", "return", "friday", "150", "tuesday", "safe", "question", "think", "departur"]
    ],
    "Voluntary Cancel": [
        ["voucher", "sir", "apolog", "refund", "miss", "guess", "sorri", "provid", "tomorrow", "inconveni"],
        ["connect", "weather", "san", "francisco", "safe", "arriv", "standbi", "forecast", "denver", "case"],
        ["great", "upgrad", "saturday", "return", "convers", "london", "natur", "tone", "come", "think"],
        ["150", "monday", "waiv", "differ", "pay", "san", "francisco", "friday", "tuesday", "credit"]
    ],
    "Baggage": [
        ["150", "friday", "return", "great", "monday", "tuesday", "differ", "waiv", "pay", "thursday"],
        ["tomorrow", "connect", "sir", "ugh", "guess", "apolog", "inconveni", "sigh", "miss", "denver"],
        ["question", "weather", "feel", "london", "safe", "doubl", "thing", "depart", "think", "departur"],
        ["voucher", "refund", "experi", "miss", "provid", "offer", "compens", "apolog", "bag", "weather"]
    ],
    "ETC": [
        ["tomorrow", "connect", "denver", "miss", "apolog", "sir", "guess", "ugh", "sorri", "inconveni"],
        ["refund", "voucher", "sir", "provid", "experi", "apolog", "weather", "miss", "guess", "compens"],
        ["san", "francisco", "great", "tuesday", "question", "thursday", "safe", "departur", "glad", "think"],
        ["friday", "return", "150", "monday", "differ", "waiv", "pay", "credit", "hmm", "someth"]
    ],
    "Other Topics": [
        ["weather", "question", "departur", "safe", "bag", "feel", "london", "glad", "think", "doubl"],
        ["tomorrow", "connect", "miss", "sir", "ugh", "apolog", "guess", "inconveni", "denver", "suppos"],
        ["refund", "experi", "voucher", "sorri", "compens", "apolog", "offer", "provid", "miss", "credit"],
        ["return", "great", "friday", "monday", "150", "differ", "hmm", "tuesday", "john", "saturday"]
    ],
    "Check In": [
        ["san", "francisco", "safe", "question", "feel", "thing", "great", "weather", "come", "glad"],
        ["experi", "bag", "refund", "voucher", "provid", "offer", "apolog", "sorri", "compens", "happen"],
        ["150", "return", "friday", "monday", "differ", "great", "pay", "hmm", "waiv", "tuesday"],
        ["miss", "tomorrow", "sir", "connect", "apolog", "voucher", "guess", "ugh", "inconveni", "sorri"]
    ],
    "Schedule Change": [
        ["tomorrow", "connect", "sir", "ugh", "guess", "york", "miss", "apolog", "inconveni", "denver"],
        ["refund", "voucher", "experi", "provid", "compens", "miss", "offer", "500", "apolog", "sorri"],
        ["150", "monday", "friday", "great", "thursday", "tuesday", "john", "return", "hmm", "differ"],
        ["weather", "upgrad", "question", "bag", "safe", "mind", "doubl", "feel", "think", "forecast"]
    ],
    "Checkout": [
        ["credit", "refund", "offer", "ticket", "futur", "abl", "minut", "provid", "use", "sir"],
        ["great", "150", "monday", "return", "thursday", "tuesday", "friday", "email", "enjoy", "think"],
        ["weather", "voucher", "miss", "experi", "provid", "bag", "compens", "caus", "apolog", "sir"],
        ["tomorrow", "connect", "ugh", "denver", "guess", "york", "upgrad", "sir", "inconveni", "apolog"]
    ],
    "Unaccompanied Minor": [
        ["return", "monday", "saturday", "email", "john", "london", "tri", "friday", "york", "tomorrow"],
        ["sam", "150", "great", "paus", "departur", "glad", "standbi", "hmm", "thursday", "abl"],
        ["voucher", "miss", "apolog", "sir", "refund", "guess", "provid", "compens", "bag", "sorri"],
        ["san", "francisco", "tuesday", "great", "question", "pay", "smith", "depart", "enjoy", "bye"]
    ]
}

# Preparing the DataFrame
rows = []
for call_reason, topics in new_topic_keywords.items():
    for i, keywords in enumerate(topics, start=1):
        rows.append([call_reason, f"{i-1}", ", ".join(keywords)])

# Creating DataFrame
df = pd.DataFrame(rows, columns=["Call Reason", "Topic", "Keywords"])

# Display DataFrame
df


Unnamed: 0,Call Reason,Topic,Keywords
0,Post Flight,0,"voucher, weather, experi, provid, miss, bag, r..."
1,Post Flight,1,"tomorrow, sir, apolog, ugh, connect, guess, in..."
2,Post Flight,2,"credit, refund, ticket, abl, use, offer, 200, ..."
3,Post Flight,3,"great, monday, return, tuesday, thursday, frid..."
4,Seating,0,"150, monday, great, friday, differ, waiv, tues..."
5,Seating,1,"upgrad, bag, doubl, question, economi, everyth..."
6,Seating,2,"london, weather, safe, feel, return, thing, de..."
7,Seating,3,"voucher, sir, apolog, miss, tomorrow, guess, r..."
8,Communications,0,"voucher, sir, apolog, miss, refund, guess, tom..."
9,Communications,1,"150, monday, return, great, friday, differ, wa..."


In [180]:
df.head()

Unnamed: 0,Call Reason,Topic,Keywords
0,Post Flight,Topic 1,"voucher, weather, experi, provid, miss, bag, r..."
1,Post Flight,Topic 2,"tomorrow, sir, apolog, ugh, connect, guess, in..."
2,Post Flight,Topic 3,"credit, refund, ticket, abl, use, offer, 200, ..."
3,Post Flight,Topic 4,"great, monday, return, tuesday, thursday, frid..."
4,Seating,Topic 1,"150, monday, great, friday, differ, waiv, tues..."


In [202]:
df.to_csv('keywords.csv')

In [203]:
def predict_reason(transcript_tokens, reason_topics):
    scores = {}
    
    # Loop through each call reason's topic words
    for reason, topic_words in reason_topics.items():
        # Count the overlap between the transcript tokens and the topic words
        overlap_count = sum(1 for word in transcript_tokens if word in topic_words)
        scores[reason] = overlap_count
    
    # Find the call reason with the maximum overlap (highest score)
    predicted_reason = max(scores, key=scores.get)
    return predicted_reason, scoresdef predict_reason(transcript_tokens, reason_topics):
    scores = {}def predict_reason(transcript_tokens, reason_topics):
    scores = {}
    
    # Loop through each call reason's topic words
    for reason, topic_words in reason_topics.items():
        # Count the overlap between the transcript tokens and the topic words
        overlap_count = sum(1 for word in transcript_tokens if word in topic_words)
        scores[reason] = overlap_count
    
    # Find the call reason with the maximum overlap (highest score)
    predicted_reason = max(scores, key=scores.get)
    return predicted_reason, scores
    
    # Loop through each call reason's topic words
    for reason, topic_words in reason_topics.items():
        # Count the overlap between the transcript tokens and the topic words
        overlap_count = sum(1 for word in transcript_tokens if word in topic_words)
        scores[reason] = overlap_count
    
    # Find the call reason with the maximum overlap (highest score)
    predicted_reason = max(scores, key=scores.get)
    return predicted_reason, scores

SyntaxError: invalid syntax (Temp/ipykernel_21288/1563214781.py, line 12)