In [0]:
# Mount Google Drive inside the Colab runtime at /content/drive/ so that the
# files stored there can be accessed through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive/')

In [0]:
import os
# Switch the working directory to the project folder on the mounted drive, so
# the relative file names used for loading and saving data resolve there.
os.chdir('/content/drive/My Drive/ML')

In [0]:
import pandas as pd
from datetime import datetime

In [0]:
# Load the raw tab-separated tweets. The file has no header row, so the two
# column names are supplied explicitly.
nba_df = pd.read_csv(
    'nba.tsv',
    sep='\t',
    header=None,
    names=['text', 'date'],
)
# Preview the first rows
nba_df.head()

Unnamed: 0,text,date
0,You don’t say,1541277000.0
1,Joel Embiid out here clowning on everyone. Bes...,1541277000.0
2,I hope you talking about centers ?,1541277000.0
3,He staying lmao,1541277000.0
4,EMBIID OWNS PROPERTY IN ANDRE DRUMMONDS HEAD.,1541277000.0


# Cleaning dates

Fill missing values in the `date` column by interpolation, then parse the epoch-second timestamps into datetimes.

In [0]:
# Fill missing epoch timestamps by interpolating between neighbouring rows
# (pandas' default method is linear).
# The result is assigned back rather than calling interpolate(inplace=True) on
# the column selection: that selection is a temporary object, so with pandas
# Copy-on-Write an inplace call on it never updates nba_df (pandas 2.x already
# warns about this chained-assignment pattern).
nba_df['date'] = nba_df['date'].interpolate()
# Convert epoch seconds to datetime64 values. The values are interpreted as UTC,
# unlike datetime.fromtimestamp, which would convert to the machine's local time.
nba_df['date'] = pd.to_datetime(nba_df['date'], unit='s')


In [0]:
nba_df.head()

Unnamed: 0,text,date
0,You don’t say,2018-11-03 20:36:51
1,Joel Embiid out here clowning on everyone. Bes...,2018-11-03 20:36:53
2,I hope you talking about centers ?,2018-11-03 20:36:53
3,He staying lmao,2018-11-03 20:36:59
4,EMBIID OWNS PROPERTY IN ANDRE DRUMMONDS HEAD.,2018-11-03 20:38:05


In [0]:
nba_df.shape

(34648, 2)

# Cleaning text
Remove links, mentions, hash signs (`#`) and special characters, expand contractions, then lowercase and lemmatize the text.

In [0]:
!pip3 install nltk



In [0]:
!pip3 install contractions



In [0]:
import re
import contractions
import nltk
# Fetch the WordNet corpus that WordNetLemmatizer relies on for its lookups.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

In [0]:
def clean_text(text):
  '''
  Normalise a raw tweet for text analysis.

  Steps, in order: strip URLs, drop '#' signs (the tag word itself is kept),
  strip @mentions, expand contractions, remove every character that is not
  an ASCII letter, a digit or whitespace, lowercase, and lemmatize each token.

  Parameters
  ----------
  text : str
    Raw tweet text. Non-string values (e.g. NaN from a missing cell) are
    treated as empty.

  Returns
  -------
  str
    The cleaned tokens joined by single spaces; '' when nothing is left.
  '''
  # a missing cell arrives as float NaN, which re.sub cannot process
  if not isinstance(text, str):
    return ''
  wordnet_lemmatizer = WordNetLemmatizer()
  # remove hyperlinks: match only up to the next whitespace, so the words that
  # follow a link on the same line are kept
  text = re.sub(r'https?://\S+', '', text)
  # only removing the hash # sign from the word
  text = text.replace('#', '')
  # remove mentions; handles may contain underscores
  text = re.sub(r'@[A-Za-z0-9_]+', '', text)
  # replace contractions ("don't" -> "do not")
  text = contractions.fix(text)
  # remove special characters and lower all words; every kind of whitespace
  # (including newlines) is kept so that words on adjacent lines are not glued
  tokens = re.sub(r'[^0-9A-Za-z\s]', '', text).lower().split()
  # lemmatizing each token
  lemma_text = [wordnet_lemmatizer.lemmatize(word) for word in tokens]
  return ' '.join(lemma_text)
 

In [0]:
# Store the normalised version of every tweet in a new column
nba_df['clean_text'] = nba_df['text'].map(clean_text)
# Preview the raw and cleaned text side by side
nba_df.head()

Unnamed: 0,text,date,clean_text
0,You don’t say,2018-11-03 20:36:51,you do not say
1,Joel Embiid out here clowning on everyone. Bes...,2018-11-03 20:36:53,joel embiid out here clowning on everyone best...
2,I hope you talking about centers ?,2018-11-03 20:36:53,i hope you talking about center
3,He staying lmao,2018-11-03 20:36:59,he staying lmao
4,EMBIID OWNS PROPERTY IN ANDRE DRUMMONDS HEAD.,2018-11-03 20:38:05,embiid owns property in andre drummonds head


In [0]:
# A cleaned text that came out empty is marked as missing, after which every
# row that has a missing value in any column is discarded.
import numpy as np
nba_df['clean_text'] = nba_df['clean_text'].replace({'': np.nan})
nba_df.dropna(axis='index', how='any', inplace=True)

In [0]:
nba_df.shape

(34082, 3)

In [0]:
# Disabled step: round timestamps to the nearest second
# (Series.dt.round rounds to the nearest unit, not up).
# NOTE(review): the save step writes dates with whole seconds only -
# confirm whether this rounding is still wanted or the cell can be removed.
#nba_df['date'] = pd.to_datetime(nba_df.date)
#nba_df['date'] = nba_df.date.dt.round('S')

In [0]:
nba_df.dtypes

text                  object
date          datetime64[ns]
clean_text            object
dtype: object

In [0]:
# Write the cleaned frame to disk as UTF-8 CSV: the row index is omitted and
# dates are formatted down to whole seconds.
nba_df.to_csv(
    'clean_nba_tweets.csv',
    encoding='utf-8',
    date_format='%Y-%m-%d %H:%M:%S',
    index=False,
)