In [None]:
import pandas as pd 
import numpy as np 
import scipy as sp 
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from nltk.corpus import stopwords
from nltk import word_tokenize
from tw_preprocessor import TwitterPreprocessor
import matplotlib.pyplot as plt 

In [None]:
file = "data/train_tweets.txt"
temp = []
with open(file, 'r') as data:
    for line in data:
        # Tabs are treated like spaces; everything before the first space is
        # the user id and everything after it is the tweet text.
        user, _sep, tweet = line.replace('\t', " ").strip().partition(" ")
        temp.append([user, tweet])

In [None]:
# One row per parsed line: the user id and the tweet text.
tw = pd.DataFrame(data=temp, columns=["User", "Tweet"])

In [None]:
# Character length of every tweet, shown as a 20-bin histogram.
length = tw['Tweet'].str.len()

fig, ax = plt.subplots()
ax.hist(length, bins=20, label="tweets")
ax.legend()
plt.show()

In [None]:
from nltk.tokenize import TweetTokenizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
# English stop words from the NLTK corpus, stored as a set for membership tests.
STOPWORDS = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer 
  
# Single shared lemmatizer instance used by lemmatize() below.
lemmatizer = WordNetLemmatizer()
def lemmatize(word):
    """Return the lemma of ``word``, trying the verb form first, then the noun form.

    If lemmatising as a verb changes the word, that result is returned;
    otherwise the word is lemmatised as a noun.
    """
    as_verb = lemmatizer.lemmatize(word, 'v')
    if as_verb != word:
        return as_verb
    return lemmatizer.lemmatize(word, 'n')

def text_process(text):
    """Tokenise a tweet, drop stop words and return the lemmatised tokens as one string.

    The tokenizer lower-cases the text, strips @handles and shortens
    repeated-character runs. Tokens found in ``STOPWORDS`` are discarded and
    the remaining ones are lemmatised and joined with single spaces.
    """
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    kept = (token for token in tokenizer.tokenize(text) if token not in STOPWORDS)
    return ' '.join(map(lemmatize, kept))

In [None]:
def preprocess(tw):
    """Clean the ``Tweet`` column of ``tw`` in place and return the same frame.

    Steps, in order:
      1. lower-case the text;
      2. remove URLs;
      3. remove @handles (letters, digits and underscores);
      4. remove the stand-alone markers ``rt``, ``fav`` and ``via``;
      5. drop every whitespace-separated word of three characters or fewer.

    Parameters
    ----------
    tw : pandas.DataFrame
        Frame with a string column named ``Tweet``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``Tweet`` replaced.
    """
    cleaned = (
        tw['Tweet']
        .str.lower()
        # URLs go first so the later patterns cannot break them apart.
        .str.replace(r'http.?://[^\s]+[\s]?', '', regex=True)
        # Underscore is included so "@some_user" does not leave "_user" behind.
        .str.replace(r'@[a-z0-9_]+', '', regex=True)
        # Word boundaries keep words such as "party" or "favorite" intact.
        .str.replace(r'\b(?:rt|fav|via)\b', '', regex=True)
        # split() also discards leading/trailing whitespace and line breaks.
        .apply(lambda text: ' '.join(w for w in text.split() if len(w) > 3))
    )
    # Assign the column back instead of using chained inplace replace(), which
    # does not update the parent frame under pandas Copy-on-Write.
    tw['Tweet'] = cleaned
    return tw

In [None]:
# Peek at the first ten rows before any cleaning is applied.
tw.head(10)

In [None]:
# Clean the tweet text with preprocess() and rebind tw to the returned frame.
tw = preprocess(tw)

In [None]:
# Tokenise, remove stop words and lemmatise every tweet.
tw['Tweet'] = tw['Tweet'].map(text_process)

In [None]:
# Spot-check 20 random rows after text processing.
tw.sample(20)

In [None]:
# Minimum number of tweets a user needs in order to be kept by sample_data().
min_no_tweets = 1
# Maximum number of tweets sampled per user by sample_data().
threshold = 150

In [None]:
def sample_data(tw, min_tweets=None, max_tweets=None, random_state=None):
    """Keep sufficiently active users, cap their tweets and drop very short tweets.

    Parameters
    ----------
    tw : pandas.DataFrame
        Frame with ``User`` and ``Tweet`` columns. It is not modified.
    min_tweets : int, optional
        Users with fewer tweets than this are removed. Defaults to the
        module-level ``min_no_tweets``.
    max_tweets : int, optional
        At most this many randomly chosen tweets are kept per user. Defaults
        to the module-level ``threshold``.
    random_state : int, optional
        Seed for the random selection; ``None`` keeps it unseeded.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected rows plus a ``num_of_words`` column;
        rows with fewer than three words are excluded.
    """
    if min_tweets is None:
        min_tweets = min_no_tweets
    if max_tweets is None:
        max_tweets = threshold

    cnt_user = tw['User'].value_counts()
    print(cnt_user.describe())

    # Filter on the counts Series directly: the column name of a frame built
    # from value_counts() differs between pandas versions.
    top_user = cnt_user[cnt_user >= min_tweets].index
    top_k = tw[tw['User'].isin(top_user)]
    print(top_k['User'].value_counts().describe())

    # Shuffle once, then take the first max_tweets rows of every user. This
    # gives a random sample of at most max_tweets rows per user and keeps the
    # User column regardless of how groupby.apply treats grouping columns.
    shuffled = top_k.sample(frac=1, random_state=random_state)
    sampled = shuffled.groupby('User', sort=False).head(max_tweets)

    sampled = sampled.assign(num_of_words=sampled['Tweet'].str.split().apply(len))
    return sampled[sampled['num_of_words'] >= 3].copy()

In [None]:
# Apply the per-user filtering and sampling defined in sample_data().
tw = sample_data(tw)

In [None]:
# Tweets-per-user statistics and overall frame size after sampling.
vis = tw["User"].value_counts()
print(vis.describe(), tw.shape, sep="\n")

In [None]:
# Character length of every remaining tweet, shown as a 20-bin histogram.
length = tw['Tweet'].str.len()

fig, ax = plt.subplots()
ax.hist(length, bins=20, label="tweets")
ax.legend()
plt.show()

In [None]:
# Spot-check 20 random rows of the sampled data.
tw.sample(20)

In [None]:
def rename_user(user):
    """Return ``user`` formatted as a string with the ``__label__`` prefix."""
    return f'__label__{user}'

In [None]:
# Prefix every user id with __label__.
tw['User'] = tw['User'].map(rename_user)

In [None]:
# Spot-check 15 random rows after the User column has been relabelled.
tw.sample(15)

In [None]:
# Split tweets and labels into train and test parts; the seed is fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(
    tw['Tweet'],
    tw['User'],
    random_state=0,
)

In [None]:
def write_file(X, y, filename):
    """Write one ``<label> <text>`` line per sample to ``filename``.

    Parameters
    ----------
    X : iterable of str
        Tweet texts.
    y : iterable of str
        Labels, paired positionally with ``X``.
    filename : str
        Output path; an existing file is overwritten.
    """
    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # represent non-ASCII tweet text, and fastText reads its input as UTF-8.
    with open(filename, 'w', encoding='utf-8') as writeFile:
        for user, tweet in zip(y, X):
            writeFile.write(f'{user} {tweet}\n')
    

In [None]:
# Write the training split; this file is the input of the fastText training cell.
write_file(X_train,y_train,'trainData.train')
# Write the held-out split in the same "<label> <text>" format.
write_file(X_test,y_test,'testData.valid')

In [None]:
import fasttext

In [None]:
%%time
model = fasttext.train_supervised(
    input="trainData.train", 
    lr=0.125, 
    epoch=150,  
    wordNgrams=3,  
    dim=300, 
    ws = 25,
    bucket = 5000000,
    minCount = 1,
    loss='ova')

In [None]:
%%time
preds = []
for tweet in X_test.tolist():
    preds.append(model.predict(tweet,k=1)[0][0])
    
preds = pd.Series(preds)

In [None]:
# Share of held-out tweets whose predicted label equals the true label.
accuracy = metrics.accuracy_score(y_test, preds)
print('Accuracy: ', accuracy)

In [None]:
def prepare_test_data(path="data/test_tweets_unlabeled.txt"):
    """Load the unlabelled tweets and clean them the same way as the training data.

    Parameters
    ----------
    path : str, optional
        File with one tweet per line.

    Returns
    -------
    list of str
        The cleaned tweets, in file order.
    """
    with open(path, 'r') as data:
        temp = [line for line in data]
    unlabel = pd.DataFrame(temp, columns=["Tweet"])
    # The notebook defines no clean_df(); apply the two cleaning steps used on
    # the training tweets so the model sees text in the same form.
    unlabel = preprocess(unlabel)
    unlabel['Tweet'] = unlabel['Tweet'].apply(text_process)
    return unlabel['Tweet'].tolist()
    
def submission_file(data, output_path='predicted.csv', classifier=None):
    """Predict a user for every tweet and write an ``Id,Predicted`` CSV.

    Parameters
    ----------
    data : iterable of str
        Tweets to classify; row ids start at 1 and follow iteration order.
    output_path : str, optional
        Destination CSV path; an existing file is overwritten.
    classifier : object, optional
        Object exposing ``predict(text, k=1)`` whose result holds the top
        label at ``[0][0]``. Defaults to the module-level ``model``.

    Raises
    ------
    IndexError
        If a predicted label contains no digits.
    """
    import csv
    import re

    if classifier is None:
        classifier = model

    # newline='' is what the csv module expects for files passed to csv.writer;
    # without it, extra blank rows can appear on some platforms.
    with open(output_path, 'w', newline='') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(['Id', 'Predicted'])
        for index, tweet in enumerate(data, start=1):
            label = classifier.predict(tweet, k=1)[0][0]
            # Keep only the first run of digits in the label, i.e. the user id
            # without the "__label__" prefix.
            user_id = re.findall(r"(\d+)", label)[0]
            writer.writerow([index, user_id])

In [None]:
%%time
# Load and clean the unlabelled tweets, then write the predictions CSV.
data = prepare_test_data()
submission_file(data)