In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import seaborn as sb

In [2]:
# Load the Twitter data file. It has no header row, so pandas labels the
# columns with consecutive integers.
# Forward slashes keep the relative path portable across operating systems and
# avoid the invalid "\d", "\B" and "\T" escape sequences that the backslash
# spelling produced inside a normal (non-raw) string literal.
Train_path = "../datasets/BuzzInSocialMedia/Twitter/Twitter.data"
train_df = pd.read_csv(Train_path, header = None)

# Materialise the underlying array once, then split it: every column except
# the last is a feature, the last column is the target.
data_matrix = train_df.values
X, y = data_matrix[:, :-1], data_matrix[:, -1]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# List every column whose dtype is neither int64 nor float64, so that any
# column needing encoding stands out.
numeric_kinds = ("int64", "float64")
for col, col_dtype in train_df.dtypes.items():
    if col_dtype not in numeric_kinds:
        print(col, col_dtype)

In [5]:
# Show the shape of each piece of the train/test split, in the order
# X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(466600, 77)
(116650, 77)
(466600,)
(116650,)


In [6]:
# Preview the first five rows of the loaded frame.
train_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,68,69,70,71,72,73,74,75,76,77
0,0,2,0,0,1,1,1,0,1,0,...,1.0,1.0,0,2,0,0,1,1,1,0.0
1,2,1,0,0,0,0,4,2,1,0,...,0.0,1.0,2,1,0,0,0,0,4,0.5
2,1,0,0,0,0,4,1,1,0,0,...,1.0,1.0,1,0,0,0,0,4,1,0.0
3,1,0,0,1,0,0,1,1,0,0,...,0.0,1.0,1,0,0,1,0,0,1,2.5
4,0,1,0,0,1,2,3,0,1,0,...,1.0,1.0,0,1,0,0,1,2,3,0.5


In [7]:
# Total number of missing cells in the whole frame. `isnull` is an alias of
# `isna` in pandas, so the two totals are expected to be equal.
null_total = train_df.isnull().sum().sum()
na_total = train_df.isna().sum().sum()
print(null_total, na_total)

0 0


In [8]:
# Show the column labels (integers, since the file was read with header=None).
column_labels = train_df.columns
print(column_labels)

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
            51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
            68, 69, 70, 71, 72, 73, 74, 75, 76, 77],
           dtype='int64')


In [9]:
# Tally how many columns share each dtype; keys appear in the order the
# dtypes are first encountered across the columns.
list_dtype = list(train_df.dtypes)
{kind: list_dtype.count(kind) for kind in list_dtype}

{dtype('int64'): 35, dtype('float64'): 43}

All the columns are numeric (35 are int64 and 43 are float64). Hence one-hot encoding is not required.

In [11]:
# Frequency of each distinct value in column 0, most frequent first.
column0_counts = train_df[0].value_counts()
print(column0_counts)

1       57452
0       40909
2       34530
3       24658
4       18832
        ...  
1934        1
2458        1
2253        1
2725        1
4779        1
Name: 0, Length: 4410, dtype: int64


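This shows the class imbalance: the most frequent values of column 0 (1, 0, 2, 3 and 4) each occur tens of thousands of times, while many of its 4,410 distinct values occur only once. Note that column 0 is part of the feature matrix `X`; the target `y` used for the split above is the last column (77).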

In [13]:
# For every low-cardinality column (fewer than 5 distinct values), print the
# column label followed by the number of rows for each
# (column value, column-0 value) pair.
max_levels = 5
for col in train_df.columns:
    level_count = train_df[col].nunique(dropna=False)
    if level_count >= max_levels:
        continue
    print(col)
    pair_counts = train_df.groupby([col, 0])[0].count()
    print(pair_counts)

42
42   0    
0.0  0        40360
1.0  0          549
     1        57452
     2        34530
     3        24658
              ...  
     20495        1
     22175        1
     22899        1
     23604        1
     24210        1
Name: 0, Length: 4411, dtype: int64
43
43   0    
0.0  0          217
     1        25059
     2         8518
     3         3697
     4         1721
              ...  
1.0  20495        1
     22175        1
     22899        1
     23604        1
     24210        1
Name: 0, Length: 5344, dtype: int64
44
44   0    
0.0  0        17279
     1        19105
     2         6629
     3         2900
     4         1479
              ...  
1.0  20495        1
     22175        1
     22899        1
     23604        1
     24210        1
Name: 0, Length: 5099, dtype: int64
45
45   0    
0.0  0        15134
     1        16153
     2         5676
     3         2499
     4         1255
              ...  
1.0  20495        1
     22175        1
     22899      