In [1]:
#importing the necessary libraries
import numpy as np
import pandas as pd

In [2]:
#reading the data
df = pd.read_csv('/content/tweets.csv')

In [3]:
#checking the first few rows of the data
df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [5]:
#dropping off id column because its irrelevant
df.drop('id',axis=1,inplace=True)

In [6]:
df.head()

Unnamed: 0,label,tweet
0,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,0,Finally a transparant silicon case ^^ Thanks t...
2,0,We love this! Would you go? #talk #makememorie...
3,0,I'm wired I know I'm George I was made that wa...
4,1,What amazing service! Apple won't even talk to...


In [7]:
df.shape

(7920, 2)

In [8]:
#cheking the unique values of column label
df['label'].value_counts()

label
0    5894
1    2026
Name: count, dtype: int64

#data preprocessing

In [9]:
#writing a function to remove the punctuations from text
import string
def remove_punctuation(text):
  punctuationfree = "".join([i for i in text if i not in string.punctuation])
  return punctuationfree

In [10]:
#writing a function to tokenize each word of text
import nltk
nltk.download('punkt')
def tokenization(text):
  words = nltk.word_tokenize(text)
  return words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [11]:
#writing a function to remove the stopwords
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
  output = [i for i in text if i not in stopwords]
  return output

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
#writing a function to convert words into lemmas
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatizer(text):
  lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
  return lemm_text

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [13]:
#wrting the function to process all the above functions
def preprocess(df_col):
  corpus =[]
  for item in df_col:
    new_item = remove_punctuation(item)
    new_item = new_item.lower()
    new_item = tokenization(new_item)
    new_item = remove_stopwords(new_item)
    new_item = lemmatizer(new_item)
    corpus.append(' '.join(str(x) for x in new_item))
  return corpus

In [14]:
# creating our corpus
corpus = preprocess(df.tweet)

In [15]:
corpus

['fingerprint pregnancy test httpsgooglh1mfqv android apps beautiful cute health igers iphoneonly iphonesia iphone',
 'finally transparant silicon case thanks uncle yay sony xperia sonyexperias… httpinstagramcompyget5jc6jm',
 'love would go talk makememories unplug relax iphone smartphone wifi connect httpfbme6n3lsupcu',
 'im wired know im george made way iphone cute daventry home httpinstagrampli5ujs4k',
 'amazing service apple wont even talk question unless pay 1995 stupid support',
 'iphone software update fucked phone big time stupid iphones',
 'happy u instapic instadaily u sony xperia xperiaz httpsinstagramcompz9qgfwlvj7',
 'new type c charger cable uk httpwwwebaycoukitm112598674021 … bay amazon etsy new year rob cross toby young evemun mcmafia taylor spectre 2018 newyear starting 2018 recipe technology samsunggalaxys9 iphonex pictwittercompjiwq59wtc',
 'bout go shopping listening music iphone justme music likeforlike followforfollow… httpinstagrampvj6bg5tlql',
 'photo fun selfie

# bag of words

In [16]:
#converting the text into numbers, vectors
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,2))
data_x = cv.fit_transform(corpus)
x = data_x
y = df.label

#classification into train and test

In [17]:
#classification of data into train and test
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=42)

In [18]:
#creating the classifier model
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train,y_train)

In [19]:
#cheking the accuracy of the created model
from sklearn import metrics
y_pred = clf.predict(x_test)
metrics.accuracy_score(y_test, y_pred)

0.842929292929293