# 2. VECTORIZATION

## 2.1 Imports

In [1]:
from google.colab import files
from google.colab import drive
import pandas as pd
import nltk
import io
import re
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.tokenize import word_tokenize

nltk.download("popular")
nltk.download('vader_lexicon') #sentiment analysis
nltk.download('twython') #twitter

from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do



In [2]:
drive.mount('/drive')

Mounted at /drive


## 2.2 Importing dataset

The dataset that it is going to take for this phase is called **processed_tweets.csv**

In [3]:
def upload_dataframes (index_fields):
  uploaded = files.upload()
  for fn in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(uploaded[fn])))
    df = pd.read_csv(io.StringIO(uploaded[fn].decode('utf-8')), index_col = index_fields)
    return df

In [4]:
tweets = upload_dataframes([])

Saving processed_tweets.csv to processed_tweets.csv
User uploaded file "processed_tweets.csv" with length 1538093 bytes


In [5]:
print(tweets.shape)
tweets.head()

(24783, 7)


Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,it woman complain cleaning house man always ta...
1,1,3,0,3,0,1,it boy day coldtyga down bad coffin dat he st ...
2,2,3,0,3,0,1,it dawn it ever fuck bitch start cry confused ...
3,3,3,0,2,1,1,it look like tyranny
4,4,6,0,6,0,1,it shit hear might true might baker bitch told...


## 2.3 TFIDF

In [6]:
def tfidf_vectorizer(matrix):
  vectorizer = TfidfVectorizer(min_df=1, lowercase=False)
  return vectorizer.fit_transform(matrix)

In [7]:
tweets_tfidf = tfidf_vectorizer(tweets['tweet'])
tweets_tfidf.shape

(24783, 13621)

In [11]:
pd.DataFrame(tweets_tfidf.toarray()).to_csv('/drive/My Drive/tweets_tfidf.csv')

KeyboardInterrupt: ignored

## 2.3 TFIDF with N-grams

In [7]:
def tfidf_ngram_vectorizer(matrix):
  vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=1, lowercase=False)
  return vectorizer.fit_transform(matrix)

In [8]:
tweets_ngrams = tfidf_ngram_vectorizer(tweets['tweet'])
tweets_ngrams.shape

(24783, 124570)

## 2.3 TFIDF with N-grams and POS-tagging

In [10]:
def tokenize_pos(raw):
  tokens = word_tokenize(raw)
  tags = nltk.pos_tag(tokens)
  raw_tags = " ".join(tag for (word, tag) in tags)
  return raw_tags

In [11]:
tweets_tokenize = tweets['tweet'].apply(tokenize_pos)
tweets_tokenize.head()

0          PRP NN VBZ VBG NN NN RB VBP NN
1    PRP VBD NN VB RP JJ NN NN PRP VBD NN
2      PRP VBZ PRP RB VBD JJ NN NN VBD NN
3                           PRP VBP IN NN
4     PRP VBD JJ MD JJ MD VB NN VBD DT DT
Name: tweet, dtype: object

In [12]:
tweets_ngrams_pos = tfidf_ngram_vectorizer(tweets_tokenize)
tweets_ngrams_pos.shape

(24783, 657)

## 2.4 TFIDF with N-grams, POS-tagging and other features.

### Number of RT's

In [15]:
def num_rt(raw):
  rt = 0
  words = raw.split()
  for word in words:
    if word == 'rt':
      rt += 1
  return rt

In [16]:
tweets_rt = tweets['tweet'].apply(num_rt)
tweets_rt.head()

0    0
1    0
2    0
3    0
4    0
Name: tweet, dtype: int64

###Sentiment Analysis

In [17]:
def sentiment_analysis(raw):
  sentiment_analyzer  = SentimentIntensityAnalyzer() 
  sentiment = sentiment_analyzer.polarity_scores(raw)
  return [value for key, value in sentiment.items()]

In [16]:
tweets_sentiment = tweets['tweet'].apply(sentiment_analysis)
tweets_sentiment.head()

0     [0.238, 0.762, 0.0, -0.3612]
1     [0.259, 0.741, 0.0, -0.5423]
2      [0.765, 0.235, 0.0, -0.946]
3      [0.0, 0.545, 0.455, 0.3612]
4    [0.457, 0.37, 0.173, -0.6808]
Name: tweet, dtype: object

###Hatred N-gram dictionary

In [19]:
hatred_dictionary = upload_dataframes([])

Saving refined_ngram_dict.csv to refined_ngram_dict.csv
User uploaded file "refined_ngram_dict.csv" with length 3178 bytes


In [20]:
hatred_dictionary.head()

Unnamed: 0,ngram,prophate
0,allah akbar,0.87
1,blacks,0.583
2,chink,0.467
3,chinks,0.542
4,dykes,0.602


In [21]:
def hatred_dict_analysis(raw):
  weights = []
  for i in hatred_dictionary.index:
    if re.search(hatred_dictionary['ngram'][i], raw) is None:
      weights.append(0.0)
    else:
      weights.append(hatred_dictionary['prophate'][i])
  return weights

In [22]:
tweets_hatred_dict = tweets['tweet'].apply(hatred_dict_analysis)
tweets_hatred_dict.head()

0    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
1    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: tweet, dtype: object

##2.5 Unify all configurations

In [None]:
configurations = pd.DataFrame(np.concatenate(tweets_tfidf.toarray(), tweets_ngrams.toarray()))
configurations.head()