In [1]:
# Import libraries here that you need for different processing steps
import nltk
import csv
import spacy
import pandas as pd

## Loading the dataset


In [17]:
# Load the tweet dataset from its CSV file into a pandas DataFrame
data_file = "./Dataset/covid.csv"
data_df = pd.read_csv(data_file)

# Report how many rows were loaded, then render the frame itself
print("Training set: ", data_df.shape[0])

display(data_df)

## Basic Data Cleaning and Transformations

## Counting Sentiment Labels

In [18]:
# Run this cell to see how the sentiment labels are distributed in the dataset

sentiment_counts = data_df["Sentiment"].value_counts()
print(sentiment_counts)

## Dropping Columns
Identify the columns that you think are less relevant for text mining or other NLP tasks, and drop them.

In [19]:
# Remove the columns judged less relevant for text mining.
# The result is reassigned (instead of using inplace=True) and errors="ignore"
# is passed so the cell is idempotent: running it again after the columns are
# already gone is a no-op rather than a KeyError.
data_df = data_df.drop(columns=['ScreenName', 'TweetAt'], errors="ignore")

# Confirm the new shape and preview the first rows
print(data_df.shape)
data_df.head(10)

## Handling NULL values
Handle null values in a column by specifying the alternate value

In [20]:
# Run this cell to replace every null value, in every column of the dataframe,
# with the desired placeholder value.
# Then try doing the same for a single column yourself.

data_df = data_df.fillna(value="NA")
data_df.head()

## Handling Exceptions
Use try-except-pass

In [21]:
# Wrap the statements you suspect might fail in a try-except block.
# In the except block, handle the exception according to the requirement.
#
# `Exception` is caught instead of using a bare `except:` so that
# KeyboardInterrupt / SystemExit still propagate, and the failure is reported
# rather than silently passed over, so a failed fill does not go unnoticed.

try:
    data_df = data_df.fillna("NA")
except Exception as err:
    print("Could not fill null values:", err)

## Lowercase conversion

In [22]:
# Demonstrate lower-case conversion on a single tweet

sample_tweet = data_df.OriginalTweet.tolist()[100]

print(sample_tweet)
print("\n")
print(sample_tweet.lower())

## Handling Special Characters or links

This should be an interesting step. You can remove special characters, links, or anything else that adds no value for sentiment analysis.

In [23]:
# Demonstrate stripping special characters from a single tweet

import re

# Each run of characters other than ASCII letters and digits becomes one space
non_alnum_run = re.compile('[^A-Za-z0-9]+')
sample_tweet = data_df.OriginalTweet.tolist()[100]

print(sample_tweet)
print("\n")
print(non_alnum_run.sub(' ', sample_tweet))

## TEXT PRE-PROCESSING

The following techniques transform the data into a cleaner dataset. Try out whichever techniques you would apply to your textual data to get the best-quality dataset for a model.
You can use any or all of the steps mentioned below, or add your own, in any order that you find appropriate.

- Tokenization
- Sentence Segmentation

- Stemming
- Lemmatization
- PartOfSpeech Tagging
- Others


In [24]:
# Run this cell to see an example of text pre-processing that uses a few of these techniques.
# The image is only a hint at possible pre-processing steps; it may or may not be the best pipeline.
# It shows how the input text changes after each processing step.
# Look into different tokenizers, stemmers, lemmatizers, etc. and pick the best ones for your task!

from PIL import Image

example_image_path = './Dataset/Text_preprocessing_example.png'
pil_im = Image.open(example_image_path)
display(pil_im)

## Tokenization using nltk library
 Links to a few word and sentence tokenizers- https://www.nltk.org/howto/tokenize.html

In [25]:
# Demonstrate splitting a single tweet into word tokens

from nltk.tokenize import word_tokenize

sample_tweet = data_df.OriginalTweet.tolist()[100]

print(sample_tweet)
print("\n")
print(word_tokenize(sample_tweet))

## Stemming
Links to a few nltk Stemmers- https://www.nltk.org/howto/stem.html

In [26]:
# Demonstrate stemming each whitespace-separated word of a single tweet

from nltk.stem import PorterStemmer

ps = PorterStemmer()

raw_tweet = data_df.OriginalTweet.tolist()[51]
print(raw_tweet)
print("\n")

# Split on whitespace only, so punctuation stays attached to its word
tweet = raw_tweet.split()

# Print every word next to its Porter stem
for word in tweet:
    stemmed = ps.stem(word)
    print(word, " ", stemmed, " ")