# Feature Extraction

In [1]:
import pandas as pd
import re
from sklearn import preprocessing


def cleanData(text):
    removeList = ["@handle","@handle:"]
    text_r = " ".join([char for char in text.split(' ') if char not in removeList and not any(i.isdigit() for i in char) and char.isdigit]) 
    return text_r

def removeHyperLink(text):
    text,count = re.subn(r'http?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    return(text,count)
    
def removeSpChar(text):
    new = re.sub('[@_!#$%^&*()<>?/\|}{~:\']' ,'', text)
    count = len(text) - len(new)
    return new,count

def removeRT(text):
    text_r = " ".join([char for char in text.split(' ') if char != 'RT'])
    return text_r


##Normalization

# def normalizeDF(df_orig):
#     normalizeColumn(df_orig,'chCount')
#     normalizeColumn(df_orig,'wdCount')
#     normalizeColumn(df_orig,'spCount')
#     #normalizeColumn(df_orig,'isCap')
#     normalizeColumn(df_orig,'hashtags')
#     return df_orig

    
## Tokenize and stem data

import nltk
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords


def tokenize(text):
    stop_words = set(stopwords.words('english')) 
    word_tokens = nltk.word_tokenize(text)
    filtered_sentence = [] 
    for w in word_tokens: 
        #if w not in stop_words: 
        filtered_sentence.append(w) 
    stems = []
    for item in filtered_sentence:
        stems.append(PorterStemmer().stem(item))
    return ' '.join(word for word in stems)

def preprocessingDF(df_orig):
    df_orig['tweet'] = df_orig['rtweet'].apply(lambda x: cleanData(x))
    df_orig['tweet'],df_orig['NumLink'] = zip(*df_orig['tweet'].map(removeHyperLink))
    df_orig['hashtags'] = df_orig['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
    df_orig['tweet'],df_orig['spCount'] =  zip(*df_orig['tweet'].map(removeSpChar))
    df_orig['wdCount'] = df_orig['tweet'].apply(lambda x: len(str(x).split(" ")))
    df_orig['chCount'] = df_orig['tweet'].str.len()
    df_orig['isCap'] = df_orig['tweet'].apply(lambda x: len([x for x in x.split() if x[0].isupper()]))
    df_orig['isRT'] = df_orig['tweet'].apply(lambda x: 1 if 'RT' in x else 0)
    df_orig['tweet'] = df_orig['tweet'].apply(lambda x: removeRT(x))
    df_orig['tweet'] = df_orig['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
    df_orig['tweet'] = df_orig['tweet'].apply(lambda x: tokenize(x))
    return df_orig


def trainRT(df_orig):
    ## Rate of RT
    #df_orig['atweet'] = df_orig.groupby("id")['tweet'].transform(lambda x: ' '.join(x))
    df_orig['TotTw'] = df_orig.groupby("id")['tweet'].transform('count')
    df_orig['RTrate'] = df_orig.groupby("id")['isRT'].transform('sum')
    df_orig['RTrate'] = df_orig['RTrate']/df_orig['TotTw']
    return df_orig

def testRT(df_orig):
    df_orig['RTrate'] = df_orig['isRT']
    return df_orig

def dropFeatures(df_orig):
    return df_orig.drop(columns=["NumLink","spCount","hashtags","chCount","isCap","isRT","wdCount"])


In [2]:
df_original_tweet = pd.read_table('../data/train_tweets.txt')
#df_original_tweet = preprocessing(df_original_tweet)

#df_train_tweet = normalizeDF(df_train_tweet)

  """Entry point for launching an IPython kernel.


In [3]:
#df_original_tweet

# Model Training

In [6]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

df_tweet = df_original_tweet.iloc[:, 1:].values
y = df_original_tweet.iloc[:,0].values

encoder = LabelEncoder()
encoder.fit(y)
encoded_Y = encoder.transform(y)
# encoded_Y
sentences_train, sentences_test, y_train, y_test = train_test_split(df_tweet, encoded_Y, test_size=0.20,random_state=500)



In [7]:
df_train = pd.DataFrame(sentences_train,columns=["rtweet"])
df_train = preprocessingDF(df_train)



In [8]:

wdCount_scaler = preprocessing.MinMaxScaler()
wdCount = df_train['wdCount'].values #returns a numpy array

wdCount = wdCount.reshape(-1, 1)

wdCount_scaler.fit(wdCount)
wdCount_scaled = wdCount_scaler.transform(wdCount)
df_train['wdCount'] = pd.DataFrame(wdCount_scaled)



chCount_scaler = preprocessing.MinMaxScaler()
chCount = df_train['chCount'].values #returns a numpy array
chCount = chCount.reshape(-1, 1)

chCount_scaler.fit(chCount)
chCount_scaled = chCount_scaler.transform(chCount)
df_train['chCount'] = pd.DataFrame(chCount_scaled)




In [9]:
#df_train = dropFeatures(df_train)


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(df_train['tweet'].values)

X_bow_train = vectorizer.transform(df_train['tweet'].values)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_vecs = tfidf.fit(df_train['tweet'].values)

X_tfidf_train = tfidf_vecs.transform(df_train['tweet'].values)
X_tfidf_train





<262555x92911 sparse matrix of type '<class 'numpy.float64'>'
	with 2896777 stored elements in Compressed Sparse Row format>

In [12]:
from sklearn.decomposition import TruncatedSVD

svd_tfidf = TruncatedSVD(n_components=1000)
svd_tfidf.fit(X_tfidf_train)

matrix_tfidf_train_lowrank = svd_tfidf.transform(X_tfidf_train)

#matrix_tfidf_train_lowrank

In [13]:
df_tfidf_train = pd.DataFrame(matrix_tfidf_train_lowrank)


In [14]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=1000)
svd.fit(X_bow_train)

matrix_train_lowrank = svd.transform(X_bow_train)

df_bow_train = pd.DataFrame(matrix_train_lowrank)
combDF_bog_train = pd.concat([df_train.drop(columns=["rtweet","tweet"]), df_bow_train], axis=1)
#combDF_bog_train


In [15]:
combDF_train = pd.concat([combDF_bog_train, df_tfidf_train], axis=1)
#combDF_train

In [16]:
combDF_train.shape

(262555, 2007)

In [17]:
# ## To store


# import pickle

# def storeModel(model,filename):
#     pickle.dump(model, open(filename, 'wb'))
    
# SVD = "SVD_1000_v2"
# storeModel(svd,SVD)


In [18]:
####
## Preparing testing data
#####

df_test = pd.DataFrame(sentences_test,columns=["rtweet"])
df_test = preprocessingDF(df_test)
#df_test = dropFeatures(df_test)



In [19]:

# normalizing test data


wdCount_test = df_test['wdCount'].values #returns a numpy array
wdCount_test = wdCount_test.reshape(-1, 1)
wdCount_scaled_test = wdCount_scaler.transform(wdCount_test)
df_test['wdCount'] = pd.DataFrame(wdCount_scaled_test)



chCount_test = df_test['chCount'].values #returns a numpy array
chCount_test = chCount_test.reshape(-1, 1)

chCount_scaled_test = chCount_scaler.transform(chCount_test)
df_test['chCount'] = pd.DataFrame(chCount_scaled_test)


#df_test


In [20]:
X_bow_test = vectorizer.transform(df_test['tweet'].values)
matrix_test_lowrank = svd.transform(X_bow_test)

df_bow_test = pd.DataFrame(matrix_test_lowrank)
combDF_test = pd.concat([df_test.drop(columns=["rtweet","tweet"]), df_bow_test], axis=1)
#combDF_test


In [21]:
X_tfidf_test = tfidf_vecs.transform(df_test['tweet'].values)


matrix_tfidf_test_lowrank = svd_tfidf.transform(X_tfidf_test)

df_tfidf_test = pd.DataFrame(matrix_tfidf_test_lowrank)
combDF_test_total = pd.concat([combDF_test, df_tfidf_test], axis=1)
combDF_test_total




Unnamed: 0,NumLink,hashtags,spCount,wdCount,chCount,isCap,isRT,0,1,2,...,990,991,992,993,994,995,996,997,998,999
0,0,0,1,0.004773,0.007016,0,0,0.192994,-0.034884,0.099322,...,0.002936,0.000831,0.001480,0.001998,-0.000662,0.001764,0.000816,-0.000357,-0.000204,0.002191
1,1,0,0,0.033413,0.042095,5,0,0.517067,0.093948,0.278472,...,-0.047708,-0.025545,0.006834,0.046583,-0.000862,0.030476,-0.009843,0.005496,0.022838,0.008453
2,1,0,2,0.026253,0.028064,7,0,1.558949,-1.392155,-0.359748,...,0.022637,-0.017494,0.010043,0.014480,0.006798,-0.009007,0.000924,-0.001123,-0.011042,0.011405
3,1,0,3,0.040573,0.041160,9,0,0.737812,0.206194,0.737023,...,0.002685,0.007097,-0.002027,-0.015345,0.002407,-0.010998,-0.006199,0.000233,0.000223,-0.005848
4,0,0,0,0.035800,0.040225,3,0,1.853763,0.415569,0.212842,...,0.004941,-0.004954,0.002754,0.000551,0.002863,-0.000790,-0.002180,0.000387,-0.003505,0.000541
5,0,0,3,0.054893,0.051450,7,0,2.521867,1.474270,0.239612,...,-0.009460,0.007135,0.027583,-0.003945,-0.019125,-0.001194,-0.005433,-0.018909,0.006435,0.013304
6,0,0,1,0.019093,0.022919,8,0,0.368952,0.384042,0.686889,...,-0.002683,0.000581,-0.000716,-0.002572,0.000543,-0.002665,-0.005379,-0.001408,-0.001255,0.000553
7,0,0,1,0.014320,0.020112,0,0,0.107845,0.049721,0.083212,...,0.001302,-0.001838,-0.000244,-0.002563,-0.014278,-0.004763,0.008917,0.000227,0.016105,0.001629
8,0,0,1,0.038186,0.038354,0,0,1.023509,-0.041828,0.649776,...,-0.015315,-0.006243,0.003596,0.008603,-0.010224,0.015707,-0.010409,-0.003530,0.005275,0.001253
9,0,0,3,0.028640,0.032741,0,0,0.346484,0.120026,0.339969,...,-0.000388,-0.003235,0.000079,-0.004406,-0.001949,0.007234,-0.003761,-0.000397,0.005012,0.005738


In [22]:
combDF_test_total.shape

(65639, 2007)

In [23]:
X_train = combDF_train.values
X_test = combDF_test_total.values

input_dim = X_train.shape[1]



In [29]:
from keras.layers import Dropout
from keras.constraints import maxnorm
from keras.optimizers import SGD

model = Sequential()
# model.add(Dropout(0.2, input_shape=(input_dim,)))
# model.add(Dense(1000, kernel_initializer='normal', activation='relu', kernel_constraint=maxnorm(3)))
# model.add(Dense(100, kernel_initializer='normal', activation='relu', kernel_constraint=maxnorm(3)))
# model.add(Dense(30, kernel_initializer='normal', activation='relu', kernel_constraint=maxnorm(3)))
# model.add(Dense(250, kernel_initializer='normal', activation='relu', kernel_constraint=maxnorm(3)))

model.add(Dense(1000, input_dim=input_dim, activation='relu')) # input dimension = dimension of festure vector
model.add(Dense(100, activation='relu'))
model.add(Dense(30, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(9293, activation='softmax')) # output layer = no. of classes

#sgd = SGD(lr=0.1, momentum=0.9, decay=0.0, nesterov=False)

model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.metrics_names)

['loss', 'acc']


In [30]:
batch_size = 1000
epochs = 20
history = model.fit(X_train,y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_test,y_test))



Train on 262555 samples, validate on 65639 samples
Epoch 1/20


KeyboardInterrupt: 

In [27]:
batch_size = 500
epochs = 30
history = model.fit(X_train,y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_test,y_test))



Train on 262555 samples, validate on 65639 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [28]:
# batch_size = 500
# epochs = 300
# history = model.fit(X_train,y_train,
#                     batch_size=batch_size,
#                     epochs=epochs,
#                     verbose=1,
#                     validation_data=(X_test,y_test))



Train on 262555 samples, validate on 65639 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300


Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300


Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300
Epoch 151/300
Epoch 152/300
Epoch 153/300
Epoch 154/300
Epoch 155/300
Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300


Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300
Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 222/300
Epoch 223/300


Epoch 224/300
Epoch 225/300
Epoch 226/300
Epoch 227/300
Epoch 228/300
Epoch 229/300
Epoch 230/300
Epoch 231/300

KeyboardInterrupt: 

In [None]:
#train_histt = model.fit(X_train, y_train, epochs=100, batch_size=256, verbose=1)


In [None]:
import matplotlib.pyplot as plt
#plt.plot(history.history['loss'])
plt.plot(history.history['acc'])
plt.show()

In [None]:
plt.plot(history.history['loss'])
plt.show()

In [None]:
scores = model.evaluate(X_test, y_test, batch_size=32)
print('Test loss=%s'% scores[0])
print('Test accuracy=%s'% scores[1])

In [None]:
# ANN_MODEL = "ANN_MODEL_v4"
# VECTORIZER = "Vectorizer_v4"
# ENCODER = "ENCODER_v4"


In [None]:

# storeModel(model,ANN_MODEL)
# storeModel(vectorizer,VECTORIZER)
# storeModel(encoder,ENCODER)


In [None]:
# import pickle

# def loadModel(filename):
#     loaded_model = pickle.load(open(filename, 'rb'))
#     return loaded_model

# vectorizer = loadModel(VECTORIZER)
# encoder = loadModel(ENCODER)
# model = loadModel(ANN_MODEL)
# svd = loadModel(SVD)

# Testing

In [None]:
def readFile(path):
    with open(path) as f:
        lineList = f.readlines()
    return lineList

In [None]:
test_data = readFile('../data/test_tweets_unlabeled.txt')
df_test = pd.DataFrame(test_data,columns = ['rtweet'])


In [None]:
df_predict = preprocessingDF(df_test)
#df_test = dropFeatures(df_test)


In [None]:

# normalizing test data


wdCount_predict = df_predict['wdCount'].values #returns a numpy array
wdCount_predict = wdCount_predict.reshape(-1, 1)
wdCount_scaled_predict = wdCount_scaler.transform(wdCount_predict)
df_predict['wdCount'] = pd.DataFrame(wdCount_scaled_predict)



chCount_predict = df_predict['chCount'].values #returns a numpy array
chCount_predict = chCount_predict.reshape(-1, 1)

chCount_scaled_predict = chCount_scaler.transform(chCount_predict)
df_predict['chCount'] = pd.DataFrame(chCount_scaled_predict)


#df_predict


In [None]:
X_bow_predict = vectorizer.transform(df_predict['tweet'].values)
matrix_predict_lowrank = svd.transform(X_bow_predict)

df_bow_predict = pd.DataFrame(matrix_predict_lowrank)
combDF_predict = pd.concat([df_predict.drop(columns=["rtweet","tweet"]), df_bow_predict], axis=1)
combDF_predict


In [None]:
X_tfidf_predict = tfidf_vecs.transform(df_predict['tweet'].values)


matrix_tfidf_predict_lowrank = svd_tfidf.transform(X_tfidf_predict)

df_tfidf_predict = pd.DataFrame(matrix_tfidf_predict_lowrank)
combDF_predict_total = pd.concat([combDF_predict, df_tfidf_predict], axis=1)
combDF_predict_total




In [None]:
# test_sentences  = vectorizer.transform(df_test['tweet'])
# matrix_test_lowrank = svd.transform(test_sentences)
# count_test_df = pd.DataFrame(matrix_test_lowrank)

# df_test

In [None]:
combDF_predict_total.shape

In [None]:

# ## Concatenate 
# df2_test = df_test.drop(columns=["rtweet","tweet"])
# combined_test_df = pd.concat([df2_test, count_test_df], axis=1)
# combined_test_df


In [None]:
X_testdata = combDF_predict_total.iloc[:, 0:].values

y_testdata = model.predict_classes(X_testdata)
y_output = encoder.inverse_transform(y_testdata)


result = []
for i in range(0,len(y_output)):
    result.append(y_output[i])
print(len(result))
resDF = pd.DataFrame(result)

In [None]:
resDF.to_csv("../features/unlabelled_prediction_v18.csv")

In [None]:
#!kaggle competitions submit -c whodunnit -f submission.csv -m "Message"

# LSTM model

In [None]:
# modelLSTM = Sequential()
# modelLSTM.add(Embedding(MAX_FEATURES, 128))
# modelLSTM.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2,
#                activation='tanh', return_sequences=True))
# modelLSTM.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2, activation='tanh'))
# modelLSTM.add(Dense(number_classes, activation='sigmoid'))
# modelLSTM.compile(loss='categorical_crossentropy', optimizer = 'rmsprop',
#               metrics=['accuracy'])