In [None]:
!pip install contractions

In [1]:
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('words')

In [2]:
import re
import pickle
import numpy as np
import pandas as pd
import contractions
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer

Using TensorFlow backend.


In [3]:
# ---- Training split: load, clean tweets, encode labels ----
df_1 = pd.read_csv("../Dataset/csv/train2.csv", header=0, index_col=0)
df_1.dropna(inplace=True)  # discard rows that have a missing value in any column

# Lookup sets consulted by the filtering steps of the pipeline below.
stop_words = set(stopwords.words('english'))
english_words = set(nltk.corpus.words.words())

df_1['tweet'] = (
    df_1['tweet']
    # 1. Split on whitespace, expand contractions token by token, lower-case.
    .apply(lambda text: [contractions.fix(token, slang=False).lower() for token in text.split()])
    # 2. Remove every character that is neither a word character nor whitespace.
    .apply(lambda tokens: [re.sub(r'[^\w\s]', '', token) for token in tokens])
    # 3. Drop tokens that are NLTK English stop words.
    .apply(lambda tokens: [token for token in tokens if token not in stop_words])
    # 4. Drop tokens that contain a digit or one of the listed special characters.
    .apply(lambda tokens: [token for token in tokens
                           if re.search("[@_!#$%^&*()<>?/|}{~:0-9]", token) is None])
    # 5. Keep only tokens found in the NLTK English word list.
    .apply(lambda tokens: [token for token in tokens if token in english_words])
    # 6. Concatenate the surviving tokens back into one space-separated string.
    .apply(lambda tokens: ' '.join(tokens))
)

# Encode labels as integers: 'real' becomes 1, every other value becomes 0.
df_1['label'] = df_1['label'].apply(lambda value: 1 if value == 'real' else 0)

print(df_1.head(3))

                                                tweet  label
id                                                          
1   currently general death different small explic...      1
2                            small rise last southern      1
3   politically correct woman almost pandemic excu...      0


In [4]:
# ---- Test split: load, clean tweets, encode labels ----
df_2 = pd.read_csv("../Dataset/csv/test2.csv", header=0, index_col=0)
df_2.dropna(inplace=True)  # discard rows that have a missing value in any column

# Lookup sets consulted by the filtering steps of the pipeline below.
stop_words = set(stopwords.words('english'))
english_words = set(nltk.corpus.words.words())

df_2['tweet'] = (
    df_2['tweet']
    # 1. Split on whitespace, expand contractions token by token, lower-case.
    .apply(lambda text: [contractions.fix(token, slang=False).lower() for token in text.split()])
    # 2. Remove every character that is neither a word character nor whitespace.
    .apply(lambda tokens: [re.sub(r'[^\w\s]', '', token) for token in tokens])
    # 3. Drop tokens that are NLTK English stop words.
    .apply(lambda tokens: [token for token in tokens if token not in stop_words])
    # 4. Drop tokens that contain a digit or one of the listed special characters.
    .apply(lambda tokens: [token for token in tokens
                           if re.search("[@_!#$%^&*()<>?/|}{~:0-9]", token) is None])
    # 5. Keep only tokens found in the NLTK English word list.
    .apply(lambda tokens: [token for token in tokens if token in english_words])
    # 6. Concatenate the surviving tokens back into one space-separated string.
    .apply(lambda tokens: ' '.join(tokens))
)

# Encode labels as integers: 'real' becomes 1, every other value becomes 0.
df_2['label'] = df_2['label'].apply(lambda value: 1 if value == 'real' else 0)

In [5]:
# ---- TF-IDF features ----
# Fit on the training tweets only. fit() relearns the vocabulary and IDF
# weights from scratch on every call, so fitting a second time on the test
# split would discard everything learned from the training data (and would
# let test-set statistics shape the features). The test split is only
# transformed with the vocabulary learned from the training split.
tfidfVectorizer = TfidfVectorizer(use_idf=True, stop_words='english')
trainTfidfVector = tfidfVectorizer.fit_transform(df_1['tweet'])
testTfidfVector = tfidfVectorizer.transform(df_2['tweet'])

# ---- Integer token sequences ----
tokenizer = Tokenizer()
# NOTE(review): the word index is built from both splits (fit_on_texts
# accumulates word counts across calls). This keeps the existing vocabulary
# unchanged; consider fitting on the training split only if test-set
# vocabulary leakage matters for the evaluation.
tokenizer.fit_on_texts(df_1['tweet'])
tokenizer.fit_on_texts(df_2['tweet'])

# pad_sequences already returns a 2-D integer array, so no extra wrapping is
# needed. The test split is padded/truncated to the training width so both
# matrices have the same number of columns.
trainSequence = pad_sequences(tokenizer.texts_to_sequences(df_1['tweet']))
testSequence = pad_sequences(
    tokenizer.texts_to_sequences(df_2['tweet']),
    maxlen=trainSequence.shape[1],
)

# ---- Persist fitted objects, features and labels ----
# Each file is written inside a `with` block so the handle is flushed and
# closed immediately instead of being left open for the garbage collector.
artifacts = [
    ("../Dataset/tfidf_vectorizer-2.pickle", tfidfVectorizer),
    ("../Dataset/tokenizer-2.pickle", tokenizer),
    ("../Dataset/tfidf_train-2.pickle", trainTfidfVector),
    ("../Dataset/sequence_train-2.pickle", trainSequence),
    ("../Dataset/label_train-2.pickle", df_1['label']),
    ("../Dataset/tfidf_test-2.pickle", testTfidfVector),
    ("../Dataset/sequence_test-2.pickle", testSequence),
    ("../Dataset/label_test-2.pickle", df_2['label']),
]
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as handle:
        pickle.dump(artifact, handle)

In [6]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in a ``with`` block so its handle is closed as soon as
    loading finishes. Only call this on trusted files: unpickling can execute
    arbitrary code.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Fitted preprocessing objects.
tfidfVectorizer = _load_pickle("../Dataset/tfidf_vectorizer-2.pickle")
tokenizer = _load_pickle("../Dataset/tokenizer-2.pickle")

# Column names for the dense TF-IDF frames. get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out(); use whichever the
# installed version provides.
if hasattr(tfidfVectorizer, "get_feature_names_out"):
    feature_names = tfidfVectorizer.get_feature_names_out()
else:
    feature_names = tfidfVectorizer.get_feature_names()

# TF-IDF matrices, plus dense DataFrame views with one column per feature.
trainTfidfVector = _load_pickle("../Dataset/tfidf_train-2.pickle")
df_train = pd.DataFrame(data=trainTfidfVector.toarray(), columns=feature_names)

testTfidfVector = _load_pickle("../Dataset/tfidf_test-2.pickle")
df_test = pd.DataFrame(data=testTfidfVector.toarray(), columns=feature_names)

# Padded integer token sequences.
trainSequence = _load_pickle("../Dataset/sequence_train-2.pickle")
testSequence = _load_pickle("../Dataset/sequence_test-2.pickle")

# Integer-encoded labels (1 for 'real', 0 otherwise).
trainLabels = _load_pickle("../Dataset/label_train-2.pickle")
testLabels = _load_pickle("../Dataset/label_test-2.pickle")