## Task 5: Text processing by stemming, lemmatization and removing stop words
### Sabeel Khan, CSUID: 2829233, CIS593 - BIG DATA

### Importing necessary libraries: nltk, pandas, PorterStemmer and WordNetLemmatizer

In [23]:
import numpy as np
import pandas as pd 
import string as st
import re
import nltk
from nltk import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sabee\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

### Reading my 'tweets.csv' file with just the text

In [8]:
data = pd.read_csv('tweets.csv')
data.head()

Unnamed: 0,text
0,RT @Printer_Gobrrr: #LightningNetwork #Bitcoin...
1,RT @Blockworks_: 🚨 Meta has filed trademarks a...
2,RT @Steampad: 🔥 Unveiling our most Powerful Ch...
3,@kseni_cher Sol Projects are going crazy latel...
4,@d3protocol ⚽️ BETERO ⚽️\n\n📲 New platform for...


In [9]:
data.shape

(7157, 1)

### Removing punctuation for cleaner text

In [10]:
def remove_punct(text):
    return ("".join([ch for ch in text if ch not in st.punctuation]))

In [11]:
data['removed_punc'] = data['text'].apply(lambda x: remove_punct(x))
data.head()

Unnamed: 0,text,removed_punc
0,RT @Printer_Gobrrr: #LightningNetwork #Bitcoin...,RT PrinterGobrrr LightningNetwork Bitcoin tran...
1,RT @Blockworks_: 🚨 Meta has filed trademarks a...,RT Blockworks 🚨 Meta has filed trademarks appl...
2,RT @Steampad: 🔥 Unveiling our most Powerful Ch...,RT Steampad 🔥 Unveiling our most Powerful Char...
3,@kseni_cher Sol Projects are going crazy latel...,ksenicher Sol Projects are going crazy lately📈...
4,@d3protocol ⚽️ BETERO ⚽️\n\n📲 New platform for...,d3protocol ⚽️ BETERO ⚽️\n\n📲 New platform for ...


### Tokeninzing our text by using the split method

In [12]:
def tokenize(text):
    text = re.split('\s+' ,text)
    return [x.lower() for x in text]

In [13]:
data['tokens'] = data['removed_punc'].apply(lambda msg : tokenize(msg))
data.head()

Unnamed: 0,text,removed_punc,tokens
0,RT @Printer_Gobrrr: #LightningNetwork #Bitcoin...,RT PrinterGobrrr LightningNetwork Bitcoin tran...,"[rt, printergobrrr, lightningnetwork, bitcoin,..."
1,RT @Blockworks_: 🚨 Meta has filed trademarks a...,RT Blockworks 🚨 Meta has filed trademarks appl...,"[rt, blockworks, 🚨, meta, has, filed, trademar..."
2,RT @Steampad: 🔥 Unveiling our most Powerful Ch...,RT Steampad 🔥 Unveiling our most Powerful Char...,"[rt, steampad, 🔥, unveiling, our, most, powerf..."
3,@kseni_cher Sol Projects are going crazy latel...,ksenicher Sol Projects are going crazy lately📈...,"[ksenicher, sol, projects, are, going, crazy, ..."
4,@d3protocol ⚽️ BETERO ⚽️\n\n📲 New platform for...,d3protocol ⚽️ BETERO ⚽️\n\n📲 New platform for ...,"[d3protocol, ⚽️, betero, ⚽️, 📲, new, platform,..."


### This method is for removing all the small words

In [14]:
def remove_small_words(text):
    return [x for x in text if len(x) > 3 ]

In [15]:
data['larger_tokens'] = data['tokens'].apply(lambda x : remove_small_words(x))
data.head()

Unnamed: 0,text,removed_punc,tokens,larger_tokens
0,RT @Printer_Gobrrr: #LightningNetwork #Bitcoin...,RT PrinterGobrrr LightningNetwork Bitcoin tran...,"[rt, printergobrrr, lightningnetwork, bitcoin,...","[printergobrrr, lightningnetwork, bitcoin, tra..."
1,RT @Blockworks_: 🚨 Meta has filed trademarks a...,RT Blockworks 🚨 Meta has filed trademarks appl...,"[rt, blockworks, 🚨, meta, has, filed, trademar...","[blockworks, meta, filed, trademarks, applicat..."
2,RT @Steampad: 🔥 Unveiling our most Powerful Ch...,RT Steampad 🔥 Unveiling our most Powerful Char...,"[rt, steampad, 🔥, unveiling, our, most, powerf...","[steampad, unveiling, most, powerful, characte..."
3,@kseni_cher Sol Projects are going crazy latel...,ksenicher Sol Projects are going crazy lately📈...,"[ksenicher, sol, projects, are, going, crazy, ...","[ksenicher, projects, going, crazy, lately📈, b..."
4,@d3protocol ⚽️ BETERO ⚽️\n\n📲 New platform for...,d3protocol ⚽️ BETERO ⚽️\n\n📲 New platform for ...,"[d3protocol, ⚽️, betero, ⚽️, 📲, new, platform,...","[d3protocol, betero, platform, easy, crypto, s..."


### One of they key methods required for this task - 'Removing stop words'

In [16]:
def remove_stopwords(text):
    return [word for word in text if word not in nltk.corpus.stopwords.words('english')]

In [18]:
data['stop_words'] = data['larger_tokens'].apply(lambda x : remove_stopwords(x))
data.head()

Unnamed: 0,text,removed_punc,tokens,larger_tokens,clean_tokens,stop_words
0,RT @Printer_Gobrrr: #LightningNetwork #Bitcoin...,RT PrinterGobrrr LightningNetwork Bitcoin tran...,"[rt, printergobrrr, lightningnetwork, bitcoin,...","[printergobrrr, lightningnetwork, bitcoin, tra...","[printergobrrr, lightningnetwork, bitcoin, tra...","[printergobrrr, lightningnetwork, bitcoin, tra..."
1,RT @Blockworks_: 🚨 Meta has filed trademarks a...,RT Blockworks 🚨 Meta has filed trademarks appl...,"[rt, blockworks, 🚨, meta, has, filed, trademar...","[blockworks, meta, filed, trademarks, applicat...","[blockworks, meta, filed, trademarks, applicat...","[blockworks, meta, filed, trademarks, applicat..."
2,RT @Steampad: 🔥 Unveiling our most Powerful Ch...,RT Steampad 🔥 Unveiling our most Powerful Char...,"[rt, steampad, 🔥, unveiling, our, most, powerf...","[steampad, unveiling, most, powerful, characte...","[steampad, unveiling, powerful, characters, ga...","[steampad, unveiling, powerful, characters, ga..."
3,@kseni_cher Sol Projects are going crazy latel...,ksenicher Sol Projects are going crazy lately📈...,"[ksenicher, sol, projects, are, going, crazy, ...","[ksenicher, projects, going, crazy, lately📈, b...","[ksenicher, projects, going, crazy, lately📈, b...","[ksenicher, projects, going, crazy, lately📈, b..."
4,@d3protocol ⚽️ BETERO ⚽️\n\n📲 New platform for...,d3protocol ⚽️ BETERO ⚽️\n\n📲 New platform for ...,"[d3protocol, ⚽️, betero, ⚽️, 📲, new, platform,...","[d3protocol, betero, platform, easy, crypto, s...","[d3protocol, betero, platform, easy, crypto, s...","[d3protocol, betero, platform, easy, crypto, s..."


### Performing stemming using the 'PorterStemmer' function

In [19]:
def stemming(text):
    ps = PorterStemmer()
    return [ps.stem(word) for word in text]

In [20]:
data['stemming_words'] = data['clean_tokens'].apply(lambda wrd: stemming(wrd))
data.head()

Unnamed: 0,text,removed_punc,tokens,larger_tokens,clean_tokens,stop_words,stemming_words
0,RT @Printer_Gobrrr: #LightningNetwork #Bitcoin...,RT PrinterGobrrr LightningNetwork Bitcoin tran...,"[rt, printergobrrr, lightningnetwork, bitcoin,...","[printergobrrr, lightningnetwork, bitcoin, tra...","[printergobrrr, lightningnetwork, bitcoin, tra...","[printergobrrr, lightningnetwork, bitcoin, tra...","[printergobrrr, lightningnetwork, bitcoin, tra..."
1,RT @Blockworks_: 🚨 Meta has filed trademarks a...,RT Blockworks 🚨 Meta has filed trademarks appl...,"[rt, blockworks, 🚨, meta, has, filed, trademar...","[blockworks, meta, filed, trademarks, applicat...","[blockworks, meta, filed, trademarks, applicat...","[blockworks, meta, filed, trademarks, applicat...","[blockwork, meta, file, trademark, applic, log..."
2,RT @Steampad: 🔥 Unveiling our most Powerful Ch...,RT Steampad 🔥 Unveiling our most Powerful Char...,"[rt, steampad, 🔥, unveiling, our, most, powerf...","[steampad, unveiling, most, powerful, characte...","[steampad, unveiling, powerful, characters, ga...","[steampad, unveiling, powerful, characters, ga...","[steampad, unveil, power, charact, game, launc..."
3,@kseni_cher Sol Projects are going crazy latel...,ksenicher Sol Projects are going crazy lately📈...,"[ksenicher, sol, projects, are, going, crazy, ...","[ksenicher, projects, going, crazy, lately📈, b...","[ksenicher, projects, going, crazy, lately📈, b...","[ksenicher, projects, going, crazy, lately📈, b...","[ksenich, project, go, crazi, lately📈, battlef..."
4,@d3protocol ⚽️ BETERO ⚽️\n\n📲 New platform for...,d3protocol ⚽️ BETERO ⚽️\n\n📲 New platform for ...,"[d3protocol, ⚽️, betero, ⚽️, 📲, new, platform,...","[d3protocol, betero, platform, easy, crypto, s...","[d3protocol, betero, platform, easy, crypto, s...","[d3protocol, betero, platform, easy, crypto, s...","[d3protocol, betero, platform, easi, crypto, s..."


### Performing Lemmatization using the 'WordNetLemmatizer' function

In [21]:
def lemmatize(text):
    word_net = WordNetLemmatizer()
    return [word_net.lemmatize(word) for word in text]

In [24]:
data['lemmatization_words'] = data['clean_tokens'].apply(lambda x : lemmatize(x))
data.head()

Unnamed: 0,text,removed_punc,tokens,larger_tokens,clean_tokens,stop_words,stemming_words,lemmatization_words
0,RT @Printer_Gobrrr: #LightningNetwork #Bitcoin...,RT PrinterGobrrr LightningNetwork Bitcoin tran...,"[rt, printergobrrr, lightningnetwork, bitcoin,...","[printergobrrr, lightningnetwork, bitcoin, tra...","[printergobrrr, lightningnetwork, bitcoin, tra...","[printergobrrr, lightningnetwork, bitcoin, tra...","[printergobrrr, lightningnetwork, bitcoin, tra...","[printergobrrr, lightningnetwork, bitcoin, tra..."
1,RT @Blockworks_: 🚨 Meta has filed trademarks a...,RT Blockworks 🚨 Meta has filed trademarks appl...,"[rt, blockworks, 🚨, meta, has, filed, trademar...","[blockworks, meta, filed, trademarks, applicat...","[blockworks, meta, filed, trademarks, applicat...","[blockworks, meta, filed, trademarks, applicat...","[blockwork, meta, file, trademark, applic, log...","[blockworks, meta, filed, trademark, applicati..."
2,RT @Steampad: 🔥 Unveiling our most Powerful Ch...,RT Steampad 🔥 Unveiling our most Powerful Char...,"[rt, steampad, 🔥, unveiling, our, most, powerf...","[steampad, unveiling, most, powerful, characte...","[steampad, unveiling, powerful, characters, ga...","[steampad, unveiling, powerful, characters, ga...","[steampad, unveil, power, charact, game, launc...","[steampad, unveiling, powerful, character, gam..."
3,@kseni_cher Sol Projects are going crazy latel...,ksenicher Sol Projects are going crazy lately📈...,"[ksenicher, sol, projects, are, going, crazy, ...","[ksenicher, projects, going, crazy, lately📈, b...","[ksenicher, projects, going, crazy, lately📈, b...","[ksenicher, projects, going, crazy, lately📈, b...","[ksenich, project, go, crazi, lately📈, battlef...","[ksenicher, project, going, crazy, lately📈, ba..."
4,@d3protocol ⚽️ BETERO ⚽️\n\n📲 New platform for...,d3protocol ⚽️ BETERO ⚽️\n\n📲 New platform for ...,"[d3protocol, ⚽️, betero, ⚽️, 📲, new, platform,...","[d3protocol, betero, platform, easy, crypto, s...","[d3protocol, betero, platform, easy, crypto, s...","[d3protocol, betero, platform, easy, crypto, s...","[d3protocol, betero, platform, easi, crypto, s...","[d3protocol, betero, platform, easy, crypto, s..."


### These next few codes are for extracting 'cleaner text'

In [25]:
def return_sentences(tokens):
    return " ".join([word for word in tokens])

In [27]:
data['clean_text'] = data['lemmatization_words'].apply(lambda x : return_sentences(x))
data.head()

Unnamed: 0,text,removed_punc,tokens,larger_tokens,clean_tokens,stop_words,stemming_words,lemmatization_words,clean_text
0,RT @Printer_Gobrrr: #LightningNetwork #Bitcoin...,RT PrinterGobrrr LightningNetwork Bitcoin tran...,"[rt, printergobrrr, lightningnetwork, bitcoin,...","[printergobrrr, lightningnetwork, bitcoin, tra...","[printergobrrr, lightningnetwork, bitcoin, tra...","[printergobrrr, lightningnetwork, bitcoin, tra...","[printergobrrr, lightningnetwork, bitcoin, tra...","[printergobrrr, lightningnetwork, bitcoin, tra...",printergobrrr lightningnetwork bitcoin transac...
1,RT @Blockworks_: 🚨 Meta has filed trademarks a...,RT Blockworks 🚨 Meta has filed trademarks appl...,"[rt, blockworks, 🚨, meta, has, filed, trademar...","[blockworks, meta, filed, trademarks, applicat...","[blockworks, meta, filed, trademarks, applicat...","[blockworks, meta, filed, trademarks, applicat...","[blockwork, meta, file, trademark, applic, log...","[blockworks, meta, filed, trademark, applicati...",blockworks meta filed trademark application lo...
2,RT @Steampad: 🔥 Unveiling our most Powerful Ch...,RT Steampad 🔥 Unveiling our most Powerful Char...,"[rt, steampad, 🔥, unveiling, our, most, powerf...","[steampad, unveiling, most, powerful, characte...","[steampad, unveiling, powerful, characters, ga...","[steampad, unveiling, powerful, characters, ga...","[steampad, unveil, power, charact, game, launc...","[steampad, unveiling, powerful, character, gam...",steampad unveiling powerful character game lau...
3,@kseni_cher Sol Projects are going crazy latel...,ksenicher Sol Projects are going crazy lately📈...,"[ksenicher, sol, projects, are, going, crazy, ...","[ksenicher, projects, going, crazy, lately📈, b...","[ksenicher, projects, going, crazy, lately📈, b...","[ksenicher, projects, going, crazy, lately📈, b...","[ksenich, project, go, crazi, lately📈, battlef...","[ksenicher, project, going, crazy, lately📈, ba...",ksenicher project going crazy lately📈 battlefo...
4,@d3protocol ⚽️ BETERO ⚽️\n\n📲 New platform for...,d3protocol ⚽️ BETERO ⚽️\n\n📲 New platform for ...,"[d3protocol, ⚽️, betero, ⚽️, 📲, new, platform,...","[d3protocol, betero, platform, easy, crypto, s...","[d3protocol, betero, platform, easy, crypto, s...","[d3protocol, betero, platform, easy, crypto, s...","[d3protocol, betero, platform, easi, crypto, s...","[d3protocol, betero, platform, easy, crypto, s...",d3protocol betero platform easy crypto sport b...


### Assigning 'r' to the 'clean_text' variable

In [28]:
r = ['clean_text']

### Collecting all the 'cleaned text' to a csv file

In [29]:
for r in words: 
    if not r in stop_words: 
        appendFile = open('text_processing.csv','a', encoding='utf-8') 
        appendFile.write(r)
        appendFile.write("\n")
        appendFile.close()

## Finding the top 10 most frequent topic words in the text

In [31]:
data = pd.read_csv("text_processing.csv") 
from collections import Counter
Counter(" ".join(data["text"]).split()).most_common(10)

[('RT', 4202),
 ('crypto', 751),
 ('&amp;', 671),
 ('Crypto', 492),
 ('I', 465),
 ('#Bitcoin', 447),
 ('-', 446),
 ('#Crypto', 431),
 ('project', 402),
 ('The', 396)]