#### Importing the libraries required

In [1]:
import warnings
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
warnings.filterwarnings("ignore")

In [2]:
# Load the raw tweet export into a DataFrame; the file is decoded as Latin-1.
df = pd.read_csv(
    filepath_or_buffer="D:\\Interviews\\eClerx\\twcs\\twcs.csv",
    encoding="latin1",
)

In [3]:
# Show (row count, column count) of the raw frame.
df.shape

(1048575, 7)

#### Replacing null values with -1000

In [4]:
# Substitute the placeholder -1000 for every missing value, in every column.
# Later cells identify root tweets by comparing against this placeholder.
df = df.fillna(value=-1000)

In [5]:
# Discard the "created_at" column; it is not used by the rest of the analysis.
df = df.drop(columns=["created_at"])

#### Selecting the parent tweets/messages, i.e. those messages that have no `in_response_to_tweet_id`

In [6]:
# Root tweets are the rows whose in_response_to_tweet_id holds the -1000
# placeholder, i.e. the value was missing in the raw data.
is_root_tweet = df["in_response_to_tweet_id"].eq(-1000.0)
dfMainTweets = df.loc[is_root_tweet]

In [7]:
# Show (row count, column count) of the root-tweet subset.
dfMainTweets.shape

(280576, 6)

#### Following the relationship between `tweet_id` and `in_response_to_tweet_id` from each parent tweet and appending the replies with `inbound = True`, so that a user's entire conversation is in one place

In [None]:
# Build one merged document per root tweet: start from the root's text and walk
# down the conversation by repeatedly taking the first reply (in frame order)
# to the current tweet, appending the text of inbound replies only.

# Index the first reply to every tweet once, so each hop is a dictionary lookup
# rather than a scan of the whole frame. Rows whose in_response_to_tweet_id is
# the -1000 null placeholder are root tweets, not replies, so they are excluded.
dfReplies = df[df["in_response_to_tweet_id"] != -1000].drop_duplicates(
    subset="in_response_to_tweet_id", keep="first"
)
dictFirstReply = dict(
    zip(
        dfReplies["in_response_to_tweet_id"].tolist(),
        zip(
            dfReplies["tweet_id"].tolist(),
            dfReplies["inbound"].tolist(),
            dfReplies["text"].tolist(),
        ),
    )
)

lstTweetContent = []

# Columns are read by name so the result does not depend on column positions
# (and therefore not on which columns were dropped earlier).
for root_id, root_text in zip(
    dfMainTweets["tweet_id"].tolist(), dfMainTweets["text"].tolist()
):
    tweet_id = root_id
    content = root_text
    visited = {tweet_id}

    while tweet_id in dictFirstReply:
        tweet_id, boolInbound, reply_text = dictFirstReply[tweet_id]

        # Guard against a reply chain that loops back onto itself, which would
        # otherwise never terminate.
        if tweet_id in visited:
            break
        visited.add(tweet_id)

        # Explicit comparison: only a genuine True counts as inbound, so a
        # non-boolean placeholder value is never treated as a user message.
        if boolInbound == True:
            content = content + " " + reply_text

    lstTweetContent.append(content)

#### The merged tweets were stored at a local path and are used for further analysis

In [12]:
# The frame could be built in memory from the merged list, as in the disabled
# line below; instead the copy previously saved to disk is loaded, decoded as
# Latin-1.
# dfMergedTweets = pd.DataFrame({"text" : lstTweetContent})
dfMergedTweets = pd.read_csv(
    filepath_or_buffer="D:\\Interviews\\eClerx\\Output\\merged_tweets.csv",
    encoding="latin1",
)

In [13]:
# Keep only the "text" column (still as a one-column DataFrame) and preview
# the first rows.
dfMergedTweets = dfMergedTweets.loc[:, ["text"]]
dfMergedTweets.head()

Unnamed: 0,text
0,@sprintcare is the worst customer service @spr...
1,@115714 yÃ¢ÂÂall lie about your Ã¢ÂÂgreatÃ...
2,"@115714 whenever I contact customer support, t..."
3,actually that's a broken link you sent me and ...
4,"Yo @Ask_Spectrum, your customer service reps a..."


#### Remove every @username mention (any token starting with `@`) from each tweet

In [18]:
# Split each tweet on whitespace, discard every token that starts with "@",
# and re-join the rest with single spaces (runs of whitespace collapse as a
# result). Values are passed through str() first, so non-string cells are
# converted rather than rejected.
dfMergedTweets["text"] = [
    " ".join(token for token in str(tweet).split() if not token.startswith("@"))
    for tweet in dfMergedTweets["text"]
]

#### Removing non-ASCII (Unicode) characters from each tweet (they are dropped, not replaced with English counterparts)

In [20]:
# Encode to ASCII while ignoring anything that cannot be represented, then
# decode back to str: every non-ASCII character is dropped, not replaced.
dfMergedTweets["text"] = [
    tweet.encode("ascii", errors="ignore").decode("utf-8")
    for tweet in dfMergedTweets["text"]
]

In [21]:
# Display the cleaned frame (pandas renders a truncated view of large frames).
dfMergedTweets

Unnamed: 0,text
0,is the worst customer service I did. I have se...
1,yall lie about your great connection. 5 bars L...
2,"whenever I contact customer support, they tell..."
3,actually that's a broken link you sent me and ...
4,Yo your customer service reps are super nice b...
...,...
280662,"Hi, How can I get back to IOS10 in my iPhone 6..."
280663,Hey #Apple thanks for not warning me that the ...
280664,I need help with my iCloud Photo Library Why d...
280665,When does you good https://t.co/6IEZZBMhmn


#### Text to Numbers

In [22]:
# Bag-of-words vectorizer: English stop words are removed, terms found in
# fewer than 2 documents are ignored, and so are terms found in more than 90%
# of documents.
cv = CountVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.9,
)
# Learn the vocabulary and build the document-term matrix in one pass.
dtm = cv.fit_transform(raw_documents=dfMergedTweets["text"])

#### LDA with 10 topics

In [23]:
# Ten-topic LDA model; the random state is fixed so repeated runs give the
# same topics.
LDA = LatentDirichletAllocation(
    random_state=42,
    n_components=10,
)
# Fit on the document-term matrix; as the last expression of the cell, the
# fitted estimator is also displayed.
LDA.fit(X=dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

#### Checking the most frequent words in the 10 topics

In [24]:
# Resolve the vocabulary once, outside the loops: building it is linear in the
# vocabulary size, so doing it per printed word is wasteful. Prefer
# get_feature_names_out (newer scikit-learn releases) and fall back to
# get_feature_names where only the older method exists.
get_vocabulary = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
feature_names = get_vocabulary()

for i, topic in enumerate(LDA.components_):
    print(f"Top 15 words for Topic {i}")
    # argsort is ascending, so the last 15 positions are the 15 highest-weighted
    # words of the topic, listed from lowest to highest weight.
    print([feature_names[index] for index in topic.argsort()[-15:]])
    print("\n")

Top 15 words for Topic 0
['people', 'dm', 'customers', 'amp', 'don', 'worst', 'like', 'im', 'time', 'guys', 'phone', 'just', 'https', 'customer', 'service']


Top 15 words for Topic 1
['fargo', 'wells', 'lo', 'para', 'ya', 'da', 'se', 'por', 'es', 'mi', 'https', 'en', 'la', 'el', 'que']


Top 15 words for Topic 2
['https', 'euston', 'just', 'ticket', 'trains', 'going', 'today', 'staff', 'time', 'internet', 'london', 'customer', 'thanks', 'train', 'service']


Top 15 words for Topic 3
['ne', 'il', 'ai', 'vous', 'mon', 'pour', 'en', 'que', 'et', 'la', 'pas', 'est', 'je', 'le', 'https']


Top 15 words for Topic 4
['phone', 'app', 'dm', 'sent', 'trying', 'just', 'https', 'thanks', 'number', 'need', 'email', 'hi', 'card', 'help', 'account']


Top 15 words for Topic 5
['minutes', 'waiting', 'just', 'ordered', 'prime', 'time', 'days', 'https', 'today', 'package', 'delivered', 'day', 'amazon', 'delivery', 'order']


Top 15 words for Topic 6
['working', 'watch', 'tv', 'apple', 'fix', 'internet'

#### The following are the topics which the tweets can be clustered/grouped into:

1. Customer service calls
2. Train ticketing queries
3. Amazon delivery queries
4. Apple products service requests
5. Christmas food ordering
6. Flight related queries
7. Spotify/XBox issues and services