## Import necessary libraries

In [2]:
import pandas as pd
import sklearn
import numpy as np

## Import data

In [3]:
import os
from pathlib import Path

# Folder that holds train.csv and test.csv. The default is the original
# author's location; set the TWEET_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get("TWEET_DATA_DIR", r"C:\Users\HP\OneDrive\Desktop\DWDM lab project tweet"))

# labelled tweets (columns shown below: id, label, tweet)
train = pd.read_csv(DATA_DIR / "train.csv")
# unlabelled tweets (columns shown below: id, tweet)
test = pd.read_csv(DATA_DIR / "test.csv")

## Exploratory Data Analysis

In [4]:
# preview the first rows of the labelled training frame
train.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [5]:
# preview the last rows of the unlabelled test frame
test.tail()

Unnamed: 0,id,tweet
17192,49155,thought factory: left-right polarisation! #tru...
17193,49156,feeling like a mermaid ð #hairflip #neverre...
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...
17195,49158,"happy, at work conference: right mindset leads..."
17196,49159,"my song ""so glad"" free download! #shoegaze ..."


In [6]:
# number of training tweets whose label is 0
int((train["label"] == 0).sum())

29720

In [7]:
# number of training tweets whose label is 1
int((train["label"] == 1).sum())

2242

In [8]:
# True if any cell of the training frame is missing
train.isna().values.any()

False

## Data Cleaning & Preprocessing

In [9]:
#installtweet-preprocessor to clean data
!pip install tweet-preprocessor



In [10]:
# remove special characters using the regular expression library
import re

# Punctuation that is deleted outright (replaced by the empty string).
# Written as a raw string so the backslashes reach the regex engine untouched;
# the previous non-raw literal relied on invalid escape sequences such as "\."
# and "\;", which Python 3.12+ reports as a SyntaxWarning.
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"|()\[\]%$><{}]")

# Separators that are replaced by a single space: "-", "/", a colon together
# with the character that follows it, and a "<br />"-like HTML fragment.
# NOTE(review): the first and last alternatives look like a mangled
# "(<br\s*/><br\s*/>)" and a stray "."; they are kept as-is to preserve
# behaviour - confirm whether they are intended.
REPLACE_WITH_SPACE = re.compile(r"(<br\s/><br\s/?)|(-)|(/)|(:).")

In [11]:
import preprocessor as p

# custom function to clean the dataset (combining tweet_preprocessor and reguar expression)
def clean_tweets(df):
  """Return a list with a cleaned copy of every tweet in ``df``.

  ``df`` is any iterable of tweet strings (the notebook passes a DataFrame
  column and a plain list). Each tweet goes through three steps, in order:

  1. ``p.clean`` from tweet-preprocessor (judging by the sample output in
     this notebook it drops @mentions and #hashtags, among other things),
  2. lower-casing and removal of everything matched by ``REPLACE_NO_SPACE``,
  3. replacement of everything matched by ``REPLACE_WITH_SPACE`` by a space.

  The result has one string per input tweet, in the input order.
  """
  def _scrub(raw_tweet):
    # library pass first, then the two regex passes on the lower-cased text
    stripped = p.clean(raw_tweet)
    without_punct = REPLACE_NO_SPACE.sub("", stripped.lower())
    return REPLACE_WITH_SPACE.sub(" ", without_punct)

  return [_scrub(raw_tweet) for raw_tweet in df]

In [12]:
# run the cleaner over the raw training tweets and wrap the resulting list
# in a one-column DataFrame
train_tweet = pd.DataFrame(clean_tweets(train["tweet"]))


In [56]:
# quick sanity check of clean_tweets on a one-element list:
# the mention marker and the commas should be stripped
test_clean=clean_tweets(["hello @,,,"])
print(test_clean)

['hello ']


In [13]:
# append cleaned tweets to the training data
# (train_tweet is a one-column DataFrame; assumes its default 0..n-1 index
# matches train's index so the rows line up - TODO confirm)
train["clean_tweet"] = train_tweet

# compare the cleaned and uncleaned tweets side by side (first 10 rows)
train.head(10)

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for credit i cant use cause they dont o...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now
5,6,0,[2/2] huge fan fare and big talking before the...,2 2 huge fan fare and big talking before they ...
6,7,0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny
7,8,0,the next school year is the year for exams.ð...,the next school year is the year for exams can...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,we won love the land
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcome here im its so


In [14]:
# clean the raw test tweets and wrap the resulting list in a one-column DataFrame
test_tweet = pd.DataFrame(clean_tweets(test["tweet"]))

# attach the cleaned tweets to the test data as a new column
test["clean_tweet"] = test_tweet

# compare the cleaned and uncleaned tweets
test.tail()

Unnamed: 0,id,tweet,clean_tweet
17192,49155,thought factory: left-right polarisation! #tru...,thought factory left right polarisation &gt3
17193,49156,feeling like a mermaid ð #hairflip #neverre...,feeling like a mermaid
17194,49157,#hillary #campaigned today in #ohio((omg)) &am...,today in omg &amp used words like assets&ampli...
17195,49158,"happy, at work conference: right mindset leads...",happy at work conference right mindset leads t...
17196,49159,"my song ""so glad"" free download! #shoegaze ...",my song so glad free download


# Test and Train split

In [15]:
from sklearn.model_selection import train_test_split

# labels of the training frame, as a NumPy array
y = train["label"].values

# hold out 30% of the cleaned tweets for evaluation; the split is shuffled,
# stratified on the label and seeded so it is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    train["clean_tweet"].values,
    y,
    test_size=0.3,
    shuffle=True,
    stratify=y,
    random_state=1,
)

# Vectorize tweets using CountVectorizer

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# Toy corpus used only to illustrate what CountVectorizer produces.
documents = ["This is a class project based on data science",
             "Data science is my passion and it is fun!",
             "I want to make different projects"]

# initializing the countvectorizer
vectorizer = CountVectorizer()

# tokenize and make the document into a matrix (one row per document,
# one column per vocabulary term)
document_term_matrix = vectorizer.fit_transform(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# check the result
pd.DataFrame(document_term_matrix.toarray(), columns=feature_names)

Unnamed: 0,and,based,class,data,different,fun,is,it,make,my,on,passion,project,projects,science,this,to,want
0,0,1,1,1,0,0,1,0,0,0,1,0,1,0,1,1,0,0
1,1,0,0,1,0,1,2,1,0,1,0,1,0,0,1,0,0,0
2,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,1


In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# vectorize tweets for model building: presence/absence features (binary=True)
# with the built-in English stop-word list
vectorizer = CountVectorizer(binary=True, stop_words='english')

# Learn the vocabulary from the training split only and transform it in one
# step. Fitting on x_train + x_test would let the held-out tweets influence
# the feature space, which leaks evaluation data into preprocessing.
x_train_vec = vectorizer.fit_transform(x_train)

# the held-out tweets are only transformed with the training vocabulary;
# words never seen in training are ignored
x_test_vec = vectorizer.transform(x_test)

In [53]:
# a single hand-written sample used to probe the trained classifier
# NOTE(review): unlike the training tweets, this text is not passed through
# clean_tweets before being vectorized - confirm that is intended
test_first=["he is a black nigga"]
# encode the sample with the vocabulary already held by `vectorizer`
test_first_vec=vectorizer.transform(test_first)

# Build, Test & Deploy Model

## 1. Support Vector Classifier (SVC)

In [36]:
from sklearn.svm import SVC

# linear-kernel support vector classifier; probability=True is what makes
# predict_proba available after fitting
svm = SVC(kernel='linear', probability=True)

# fit the SVC model on the vectorized training tweets
svm.fit(x_train_vec, y_train)

# class-probability estimates for the held-out tweets
prob = svm.predict_proba(x_test_vec)

# hard class predictions for the held-out tweets
y_pred_svm = svm.predict(x_test_vec)

In [54]:
# Predict the single hand-written sample. The result is kept in its own
# variable: storing it in y_pred_svm would replace the hold-out predictions,
# and the accuracy_score(y_test, y_pred_svm) call further down would then
# fail on a top-to-bottom run because the two arrays no longer have the
# same length.
y_pred_first = svm.predict(test_first_vec)
print(y_pred_first)

[0]


### Accuracy score for SVC

In [29]:
# show an abbreviated view of the held-out cleaned tweets
print(x_test)

['off to concelebrate at the for the first time'
 'gotta love when he shows up outta nowhere with a single pink rose'
 'i am adventure' ... '  at tonight w dj  '
 'god i thank you for the of life'
 'the latest the blicqer daily thanks to']


In [21]:
from sklearn.metrics import accuracy_score
# share of held-out tweets classified correctly, as a percentage.
# NOTE: the labels are imbalanced (29720 zeros vs 2242 ones in the training
# file), so always predicting 0 would already score about 93%.
svc_accuracy_pct = accuracy_score(y_test, y_pred_svm) * 100
print("Accuracy score for SVC is: ", svc_accuracy_pct, '%')

Accuracy score for SVC is:  94.86912086766085 %


## 2. Naive Bayes Classifier

In [63]:
from sklearn.naive_bayes import GaussianNB

In [64]:
# Gaussian Naive Bayes classifier with default settings
# NOTE(review): the features built above are binary word indicators;
# a Bernoulli/Multinomial variant may be a better fit - confirm the choice
model= GaussianNB()

In [66]:
# fit on the training features converted with .toarray()
# NOTE(review): presumably x_train_vec is sparse and this materialises a
# dense rows x vocabulary array, which can be very memory hungry - verify
model.fit(x_train_vec.toarray(), y_train)

GaussianNB()

In [70]:
# predict labels for the held-out tweets (features converted with .toarray(),
# matching how the model was fitted)
y_pred_naive = model.predict(x_test_vec.toarray())
# display the predictions
y_pred_naive

array([1, 0, 0, ..., 0, 1, 1], dtype=int64)

### Accuracy score for Naive Bayes classifier

In [72]:
# share of held-out tweets classified correctly, as a percentage.
# NOTE: the labels are imbalanced (29720 zeros vs 2242 ones in the training
# file), so always predicting 0 would already score about 93%.
naive_accuracy_pct = accuracy_score(y_test, y_pred_naive) * 100
print("Accuracy score for NAive Bayes is: ", naive_accuracy_pct, '%')

Accuracy score for NAive Bayes is:  70.77901762436124 %
