## Importing Relevant Libraries

In [1]:
import pandas as pd
import numpy as np
from google.colab import drive
import os
import torch

## Checking if GPU's available and weather there are multiple GPU's available.


In [2]:
if torch.cuda.is_available():
  device=torch.device("cuda")
  print("There are {} GPU's available.".format(torch.cuda.device_count()))
  print("We will use the GPU:{}".format(torch.cuda.get_device_name(0)))
else:
  print("No Gpu available...using the CPU instead.")
  device=torch.device('cpu')

There are 1 GPU's available.
We will use the GPU:Tesla T4


## Mounting Google Drive
   We mount Google drive so that we can store serialized objects and intermediate data.

In [3]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
os.getcwd()
os.chdir('/content/gdrive/My Drive/Colab Notebooks')

## Loading in the Dataset.

In [5]:
path='/content/gdrive/My Drive/Colab Notebooks/sentiment140.csv'
covid_data=pd.read_csv(path,encoding='ISO-8859-1',parse_dates=True)

In [6]:
column_names=["target","ids","date","flag","user","text"]
covid_data.columns=column_names
covid_data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


## Checking for Repitative values.

In [None]:
covid_data.shape

(1599999, 6)

In [None]:
covid_data.ids.nunique()

1598314

### Here we can see that nearly 1685 ids are either missing or are duplicated or repititive.

In [None]:
covid_data.ids.isnull().sum()

0

In [None]:
covid_data.ids.value_counts()

1753678185    2
1678693526    2
1835870889    2
2190104868    2
1984377787    2
             ..
2056931811    1
2212131300    1
2188589814    1
1983537638    1
1964587131    1
Name: ids, Length: 1598314, dtype: int64

In [None]:
covid_data.ids.value_counts().unique()

array([2, 1])

In [None]:
covid_data[covid_data['ids']==1753678185]

Unnamed: 0,target,ids,date,flag,user,text
84765,0,1753678185,Sun May 10 02:51:55 PDT 2009,NO_QUERY,BoydyxO,sunnn finnalllyyy!! aint slept :| need some s...
918538,4,1753678185,Sun May 10 02:51:55 PDT 2009,NO_QUERY,BoydyxO,sunnn finnalllyyy!! aint slept :| need some s...


In [None]:
covid_data[covid_data['ids']==1984377787]

Unnamed: 0,target,ids,date,flag,user,text
254973,0,1984377787,Sun May 31 14:59:31 PDT 2009,NO_QUERY,15AMR,I'm broke cuz she got all she wanted! Going ho...
1194018,4,1984377787,Sun May 31 14:59:31 PDT 2009,NO_QUERY,15AMR,I'm broke cuz she got all she wanted! Going ho...


In [None]:
covid_data[covid_data['ids']==1835870889]

Unnamed: 0,target,ids,date,flag,user,text
132902,0,1835870889,Mon May 18 06:55:21 PDT 2009,NO_QUERY,lillekerohus,I'm so nervous about my guitar exam tomorrow. ...
996346,4,1835870889,Mon May 18 06:55:21 PDT 2009,NO_QUERY,lillekerohus,I'm so nervous about my guitar exam tomorrow. ...


### from the above cells we can observe that there are no missing values present but some id values are not unique and are repeated.The whole tweet is repeated and no tweet is repeated more than twice and the repeated tweets have both positive and negative sentiment labelled on it.

In [None]:
covid_data.set_index('ids',inplace=True)

In [None]:
t=covid_data.index.value_counts()==2
temp=t.loc[t[t.index]==True]
dup_indexes=temp.index
#dup_data=covid_data.loc[covid_data.ids
print(dup_indexes)

Int64Index([1753678185, 1678693526, 1835870889, 2190104868, 1984377787,
            1972032864, 2062313315, 2057992213, 1978945483, 1827871029,
            ...
            2191418783, 2013668899, 2190980212, 2051228740, 2059765907,
            2044816045, 1971122723, 1957740028, 2015887552, 1835205913],
           dtype='int64', length=1685)


In [None]:
covid_data.drop(list(dup_indexes),inplace=True)


In [None]:
covid_data.shape

(1596629, 5)

## Selecting only Relevant Columns.

In [None]:
covid_data_mod=covid_data[['target','date','text']]
covid_data_mod.head()

Unnamed: 0_level_0,target,date,text
ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1467810672,0,Mon Apr 06 22:19:49 PDT 2009,is upset that he can't update his Facebook by ...
1467810917,0,Mon Apr 06 22:19:53 PDT 2009,@Kenichan I dived many times for the ball. Man...
1467811184,0,Mon Apr 06 22:19:57 PDT 2009,my whole body feels itchy and like its on fire
1467811193,0,Mon Apr 06 22:19:57 PDT 2009,"@nationwideclass no, it's not behaving at all...."
1467811372,0,Mon Apr 06 22:20:00 PDT 2009,@Kwesidei not the whole crew


## Checking for null values in the remaining columns.



In [None]:
covid_data_mod.isnull().sum()

target    0
date      0
text      0
dtype: int64

## Converting the dates into a Standard Format.

In [None]:
covid_data_mod['date']=covid_data_mod['date'].apply(lambda x: pd.to_datetime(x).strftime('%m/%d/%Y'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
covid_data_mod.head()

Unnamed: 0_level_0,target,date,text
ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1467810672,0,04/06/2009,is upset that he can't update his Facebook by ...
1467810917,0,04/06/2009,@Kenichan I dived many times for the ball. Man...
1467811184,0,04/06/2009,my whole body feels itchy and like its on fire
1467811193,0,04/06/2009,"@nationwideclass no, it's not behaving at all...."
1467811372,0,04/06/2009,@Kwesidei not the whole crew


### Shuffling the Dataset.

In [None]:
covid_data_mod=covid_data_mod.sample(frac=1)
covid_data_mod.target.values[:50]

array([0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 4, 0, 0, 0, 4, 0, 0, 4, 0, 0, 4, 4,
       0, 0, 0, 4, 4, 0, 0, 0, 4, 0, 4, 4, 4, 4, 4, 0, 4, 0, 0, 4, 4, 4,
       4, 4, 0, 0, 0, 4])

## Serializing the DataFrame.

In [23]:
os.getcwd()

'/content/gdrive/My Drive/Colab Notebooks/data_chunks'

In [None]:
covid_data_mod.to_pickle("covid_data_mod_pickled")

In [7]:
data=pd.read_pickle("covid_data_mod_pickled")

In [8]:
data.target=data.target.replace({0:0,4:1})

In [9]:
data.target.value_counts()

1    798315
0    798314
Name: target, dtype: int64

In [10]:
possible_labels=data.target.unique()
possible_labels

array([0, 1])

In [11]:
from sklearn.model_selection import train_test_split
x_train,x_val,y_train,y_val=train_test_split(data.index.values,data.target.values,test_size=0.20,random_state=11,stratify=data.target.values)


In [12]:
data['data_type']=['not_set']*data.shape[0]

In [13]:
data.data_type.loc[x_train]='train'
data.data_type.loc[x_val]='val'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [14]:
data.groupby(['target','data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,text
target,data_type,Unnamed: 2_level_1,Unnamed: 3_level_1
0,train,638651,638651
0,val,159663,159663
1,train,638652,638652
1,val,159663,159663


In [15]:
path='/content/gdrive/My Drive/Colab Notebooks/data_chunks'
os.chdir(path)

In [None]:
#os.mkdir("data_chunks")

In [24]:
os.getcwd()

'/content/gdrive/My Drive/Colab Notebooks/data_chunks'

In [21]:
train_data=data.loc[x_train]
train_data.drop(['date'],axis=1,inplace=True)
val_data=data.loc[x_val]
val_data.drop(['date'],axis=1,inplace=True)

In [None]:
len(train_data.target.values==y_train)

1277303

In [None]:
len(val_data.target.values==y_val)

319326

In [None]:
x_train.shape

(1277303,)

In [None]:
y_train.shape

(1277303,)

In [25]:
i=0
chunk_size=100000
count=1
while count<14:
  if count<13:
    chunk=train_data.iloc[i:chunk_size+i,:]
    i=i+chunk_size
  else:
    chunk=train_data.iloc[i:,:]
  chunk.to_pickle("train_chunk_{}".format(count))
  print("finished working on train_chunk_{}".format(count))
  count+=1

finished working on train_chunk_1
finished working on train_chunk_2
finished working on train_chunk_3
finished working on train_chunk_4
finished working on train_chunk_5
finished working on train_chunk_6
finished working on train_chunk_7
finished working on train_chunk_8
finished working on train_chunk_9
finished working on train_chunk_10
finished working on train_chunk_11
finished working on train_chunk_12
finished working on train_chunk_13


In [26]:
val_data.to_pickle("val_chunk")