## Import Python Libraries 

In [None]:
# Import libraries here that you need for different processing steps
import nltk
import csv
import spacy
import pandas as pd

## Functions to read data and convert it into dataframe

In [None]:
# Load the dataset CSV into a DataFrame, then report each column's dtype and
# the (rows, columns) shape so the schema can be checked at a glance.
df1 = pd.read_csv('./Dataset/covid.csv')

for summary in (df1.dtypes, df1.shape):
    print(summary)

# The last expression is rendered by the notebook as a preview of the first rows.
df1.head()

In [None]:
# Load a CSV that is stored inside a zip archive; pandas decompresses it while
# reading. The parser options are spelled out explicitly: UTF-8 text, first row
# is the header, comma-separated fields, double quote as the quoting character.
df2 = pd.read_csv(
    './Dataset/covid.csv.zip',
    compression='zip',
    encoding="UTF-8",
    header=0,
    sep=',',
    quotechar='"',
)

# Report column dtypes and the (rows, columns) shape.
for summary in (df2.dtypes, df2.shape):
    print(summary)

# Preview the first rows as the cell's output.
df2.head()

## Function to write data to a csv file

In [None]:
# Save the first 2000 rows of df2 to a new CSV file.
# .iloc makes the positional row slice explicit; fewer rows are kept if df2 is shorter.
subset = df2.iloc[:2000]
# index=False (the boolean that to_csv documents) leaves the row index out of the file.
subset.to_csv('./Dataset/covid_subset.csv', index=False)

## Reading a CSV into a list for processing line by line

In [None]:
import numpy as np
# Read the subset CSV as raw text lines: one string per physical line, with the
# trailing newline kept. The fields are NOT parsed here, so line 0 is the header
# row and a quoted field containing a line break would span several entries.
# NOTE(review): num_of_instances counts physical lines (header included), not
# parsed records - confirm that this is the intended meaning.

with open('./Dataset/covid_subset.csv', encoding="UTF-8") as f:
    content = f.readlines()

lines = np.array(content)
num_of_instances = lines.size

print(num_of_instances)
# Show the first three lines; slicing (rather than indexing 0, 1, 2) avoids an
# IndexError when the file holds fewer than three lines.
for line in lines[:3]:
    print(line)

## Data Exploration

### Length of text
##### Identify the text column on which you have to apply text processing in the dataset

In [None]:
# Report the length, in characters, of the longest entry in the OriginalTweet column.
max_tweet_length = df1["OriginalTweet"].str.len().max()
print(max_tweet_length)

### Describe data columns

In [None]:
# Show summary statistics for df1 via DataFrame.describe(), rendered as the cell's output.
# How meaningful each figure is depends on what the summarised column represents.

df1.describe()

### Tokenize sentences

In [None]:
# Pick one example tweet for the tokenization cells: the row whose index label
# is 6 (with the default 0-based index produced by read_csv, the seventh row).
six_dialogue = df1["OriginalTweet"].loc[6]
print(six_dialogue)

#### Tokenize sentences using nltk

In [None]:
# One-time download of the 'punkt_tab' resource through NLTK's downloader.
# NOTE(review): presumably the data sent_tokenize needs in recent NLTK releases -
# confirm against the installed NLTK version.
nltk.download('punkt_tab')

In [None]:
# Split the example tweet (six_dialogue) into sentences with NLTK's sentence
# tokenizer. The cell output is the resulting sentences, not word tokens.
from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(six_dialogue)
sentences

#### Tokenize sentences using spacy

In [None]:
# One-time setup: uncomment and run to install the model that spacy.load("en_core_web_sm") needs.
# !python -m spacy download en_core_web_sm

In [None]:
# Sentence-split the same example tweet (six_dialogue) with spaCy, for
# comparison with the NLTK result.
from spacy.lang.en import English

# Load the en_core_web_sm pipeline and run it on the tweet; iterating .sents
# yields one span per detected sentence.
nlp = spacy.load("en_core_web_sm")
parsed_tweet = nlp(six_dialogue)
[sentence.text for sentence in parsed_tweet.sents]

### Tokenize words

In [None]:
# Re-select the example tweet for the word-tokenization cells: the row whose
# index label is 6 (with the default 0-based index, the seventh row).
six_dialogue = df1["OriginalTweet"].loc[6]
print(six_dialogue)

#### Tokenize words using nltk

In [None]:
# Word-level tokenization of the example tweet with NLTK's WordPunctTokenizer.
from nltk.tokenize import WordPunctTokenizer

tok = WordPunctTokenizer()

# Collect the tokens into a list and print them.
words = list(tok.tokenize(six_dialogue))
print(words)

#### Tokenize words using spacy

In [None]:
from spacy.lang.en import English

# Only the tokenizer (default English settings) is taken from this English() object.
# NOTE(review): this rebinds `nlp`, replacing the en_core_web_sm pipeline loaded
# in the sentence-tokenization cell - re-running that cell's logic afterwards
# would use this object instead.
nlp = English()
tokenizer = nlp.tokenizer

# Tokenize the example tweet and keep the text of every token.
doc = tokenizer(six_dialogue)
words = [word_token.text for word_token in doc]
print(words)

### Word Clouds

A word cloud is just a fancy way to see which words appear in your textual columns. Run the cells below to see which words occur in the Tweets. Don't worry about the code.

In [None]:
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt

In [None]:
# Concatenate every OriginalTweet entry into one space-separated string;
# str() converts any non-string cell so the join cannot fail on it.
text = " ".join(map(str, df1["OriginalTweet"]))

# Build the cloud on a white background, leaving out the words in wordcloud's STOPWORDS set.
stopwords = set(STOPWORDS)
cloud_builder = WordCloud(stopwords=stopwords, background_color="white")
wordcloud = cloud_builder.generate(text)

# Draw the cloud as an image on a 15x10 figure with the axes hidden.
plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()