In [None]:
import pandas as pd

# Read the training and evaluation CSV files (comma-separated) into DataFrames.
train_raw, test_raw = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in ('../data/train.csv', '../data/eval.csv')
)

# Preview the first five rows of the training frame.
train_raw.head(5)

In [None]:
# Show up to 40 rows whose `lang` value is anything other than "en".
is_not_english = train_raw["lang"].ne("en")
train_raw.loc[is_not_english].head(40)

In [None]:
## Check null values and n_unique
# One row per column of train_raw: how many entries are missing ('is_null')
# and how many distinct values it holds ('n_unique').
null_counts = train_raw.isnull().sum()
unique_counts = train_raw.nunique()
pd.concat([null_counts, unique_counts], axis=1, keys=['is_null', 'n_unique'])

In [None]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

# Tokenizer built on the blank English pipeline's vocabulary.
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

# train_raw= train_raw.drop(columns=['id','time','smth'])

# Binarise the label: True for 'Neutral', False for every other value.
# Guarded so that re-running this cell does not compare the already-boolean
# column against 'Neutral' a second time, which would turn every label False.
if not pd.api.types.is_bool_dtype(train_raw['sent']):
    train_raw['sent'] = train_raw['sent'].eq('Neutral')

# Treat missing tweets as empty strings so that len() and the tokenizer always
# receive a str instead of a float NaN (which would raise a TypeError).
tweets = train_raw['tweet'].fillna('')

# Character length of each tweet.
train_raw['len_tweet'] = tweets.map(len)
# Token texts of each tweet, as produced by the tokenizer above.
train_raw['tokens'] = tweets.map(lambda tweet: [tok.text for tok in tokenizer(tweet)])
# Number of tokens per tweet.
train_raw['n_tokens'] = train_raw['tokens'].map(len)
# Number of tokens containing the substring 'http' (used as a URL count).
train_raw['n_urls'] = train_raw['tokens'].map(lambda tokens: sum('http' in tok for tok in tokens))


train_raw.head(5)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Draw one histogram (with KDE overlay) per feature column, side by side,
# each coloured by the value of the `sent` column.
plt.figure(figsize=(15,5))
for panel, feature in enumerate(('n_tokens', 'len_tweet', 'n_urls'), start=1):
    plt.subplot(1, 3, panel)
    sns.histplot(data=train_raw, x=feature, kde=True, bins=20, hue='sent')
plt.show()

In [None]:
# Heatmap for later use
# Correlation is only defined for numeric data, while train_raw also holds
# non-numeric columns (e.g. 'tweet', 'lang'). Select the numeric and boolean
# columns explicitly: pandas >= 2.0 no longer drops non-numeric columns
# silently inside DataFrame.corr() and raises an error instead.
numeric_features = train_raw.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(12, 8))
sns.set(font_scale=1.5)
sns.heatmap(numeric_features.corr(), cmap='coolwarm', annot=True, annot_kws={'size':15})
plt.show()

In [None]:
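# # Disabled spell-checking sketch (symspellpy); might be useful later.
# # NOTE: it still refers to `train['search_term']`, which the cells above never
# # define (they use `train_raw` / 'tweet') — adapt those names before enabling.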

# from tqdm import tqdm
# import pkg_resources
# from symspellpy import SymSpell, Verbosity

# sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# dictionary_path = pkg_resources.resource_filename(
#     "symspellpy", "frequency_dictionary_en_82_765.txt")
# bigram_path = pkg_resources.resource_filename(
#     "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
# # term_index is the column of the term and count_index is the
# # column of the term frequency
# sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
# sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

# def check_spell(input_term):
#     # lookup suggestions for multi-word input strings (supports compound
#     # splitting & merging)
#     # max edit distance per lookup (per single word, not per whole input string)
#     suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)
#     # advance the progress bar, then return the first suggestion's term
#     pbar.update()
#     return suggestions[0]._term

# pbar = tqdm(total=len(train['search_term']))
# train['search_term'] = train['search_term'].map(check_spell)
# pbar.close()