# Importing packages and loading files

In [None]:
# Basic packages
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt
from pathlib import Path

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers

In [None]:
# Open Colab's file-upload dialog; later cells look files up in `uploaded`
# by their filename (e.g. uploaded['training.csv']).
from google.colab import files
uploaded = files.upload()

Saving test (1).csv to test (1).csv
Saving training.csv to training.csv


In [None]:
import io
# Parse the uploaded training CSV straight from memory: the payload stored
# under its filename is wrapped in BytesIO so read_csv can treat it as a file.
df = pd.read_csv(io.BytesIO(uploaded['training.csv']))
# Preview the frame (columns: id, text, label).
df

Unnamed: 0,id,text,label
0,1,i didnt feel humiliated,0
1,2,i can go from feeling so hopeless to so damned...,0
2,3,im grabbing a minute to post i feel greedy wrong,3
3,4,i am ever feeling nostalgic about the fireplac...,2
4,5,i am feeling grouchy,3
...,...,...,...
17995,17996,im having ssa examination tomorrow in the morn...,0
17996,17997,i constantly worry about their fight against n...,1
17997,17998,i feel its important to share this info for th...,1
17998,17999,i truly feel that if you are passionate enough...,1


# Pre-processing

In [None]:
!pip install contractions
import contractions
df['text'][0]=contractions.fix(df['text'][0])

Collecting contractions
  Downloading contractions-0.0.58-py2.py3-none-any.whl (8.0 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.2.tar.gz (321 kB)
[K     |████████████████████████████████| 321 kB 5.2 MB/s 
[?25hCollecting anyascii
  Downloading anyascii-0.3.0-py3-none-any.whl (284 kB)
[K     |████████████████████████████████| 284 kB 42.9 MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.2-cp37-cp37m-linux_x86_64.whl size=85446 sha256=f59ed1c4214b29932761ee4432c6499ab27379e01d419dd390439e4c589f2b5b
  Stored in directory: /root/.cache/pip/wheels/25/19/a6/8f363d9939162782bb8439d886469756271abc01f76fbd790f
Successfully built pyahocorasick
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully install

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Expand contractions across the whole training text column.
# Rebinding a loop variable (`i = contractions.fix(i)`) never writes back to
# df, so the transformed values are assigned to the column explicitly.
# Lower-case afterwards: expansions may come back capitalised (e.g. "I am" —
# NOTE(review): confirm for the contractions version in use), and the
# stop-word filter applied later compares words case-sensitively.
df['text'] = df['text'].apply(contractions.fix).str.lower()

In [None]:
import nltk
from nltk.corpus import stopwords

# Make sure NLTK's stop-word corpus is available locally.
nltk.download('stopwords')

# English stop words plus two apostrophe-less contractions ('im', 'ive').
# 'feel' / 'feeling' were tried as extra stop words but are disabled here.
stop_words = set(stopwords.words('english'))
stop_words.update(('im', 'ive'))
def remove_stopwords(text, words_to_drop=None):
    """Return `text` with stop words removed.

    The input is converted with str(), split on whitespace, filtered, and
    re-joined with single spaces. Matching is case-sensitive.

    Args:
        text: Value to clean; anything accepted by str().
        words_to_drop: Optional container of words to remove. When omitted,
            the notebook-level `stop_words` set is used, as before.

    Returns:
        The remaining words joined by single spaces ("" if none remain).
    """
    if words_to_drop is None:
        # Looked up at call time, so the set active when the cell runs is used.
        words_to_drop = stop_words
    return " ".join(word for word in str(text).split() if word not in words_to_drop)
# Strip stop words from every training text (overwrites the column in place).
df['text'] = df['text'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Tokenizer

In [None]:
# Features are the cleaned text column; targets are the integer label column.
X_train=df['text']
Y_train=df['label']

In [None]:
# Word-level tokenizer: vocabulary capped via num_words, characters listed in
# `filters` stripped, text lower-cased, tokens split on spaces.
tk = Tokenizer(num_words=18000,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True,
               split=" ")
# Learn the word index from the full training text (this happens before the
# train/validation split performed further down).
tk.fit_on_texts(X_train)

# Encode every text as a list of integer word ids.
X_train_seq = tk.texts_to_sequences(X_train)

In [None]:
# Summary of the text column (count, distinct texts, most frequent text).
X_train.describe()

count             18000
unique            17834
top       feel tortured
freq                  5
Name: text, dtype: object

In [None]:
# Words per text, counted by splitting on single spaces; the distribution is
# used to choose the padding length.
seq_lengths = X_train.str.split(' ').str.len()
seq_lengths.describe()

count    18000.000000
mean         9.147722
std          5.150398
min          1.000000
25%          5.000000
50%          8.000000
75%         12.000000
max         35.000000
Name: text, dtype: float64

In [None]:
# Pad/truncate every id sequence to 35 entries, the maximum word count
# reported by seq_lengths.describe().
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=35)

In [None]:
# Inspect the first padded sequence.
X_train_seq_trunc[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,  48,   1, 540], dtype=int32)

In [None]:
# Encode the labels as integer class ids; `le` is kept so predictions can be
# mapped back with inverse_transform later.
le = LabelEncoder()
Y_train_le = le.fit_transform(Y_train)

In [None]:
# Display the encoded labels.
Y_train_le

array([0, 0, 3, ..., 1, 1, 1])

In [None]:
# One-hot encode the integer class ids.
Y_train_cat = to_categorical(Y_train_le)

In [None]:
# Display the one-hot label matrix.
Y_train_cat

array([[1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       ...,
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]], dtype=float32)

In [None]:
# Same one-hot encoding as Y_train_cat, recomputed under the name that the
# train/validation split consumes.
y_train_oh = to_categorical(Y_train_le)

In [None]:
# Hold out 10% of the padded sequences and their one-hot labels for
# validation; random_state fixes the shuffle so the split is repeatable.
X_train_emb, X_valid_emb, y_train_emb, y_valid_emb = train_test_split(X_train_seq_trunc, y_train_oh, test_size=0.1, random_state=37)

In [None]:
# Sanity check: features and labels must have matching row counts in each split.
assert len(X_valid_emb) == len(y_valid_emb)
assert len(X_train_emb) == len(y_train_emb)

# Model Building

In [None]:
# Small embedding classifier:
#   - 8-dimensional embeddings for word ids below 18000, 35 ids per input
#   - max over the sequence axis, giving one 8-vector per text
#   - softmax over the 6 classes
emb_model = models.Sequential([
    layers.Embedding(18000, 8, input_length=35),
    layers.GlobalMaxPool1D(),
    layers.Dense(6, activation='softmax'),
])
emb_model.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 35, 8)             144000    
                                                                 
 global_max_pooling1d_8 (Glo  (None, 8)                0         
 balMaxPooling1D)                                                
                                                                 
 dense_14 (Dense)            (None, 6)                 54        
                                                                 
Total params: 144,054
Trainable params: 144,054
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Categorical cross-entropy matches the one-hot labels and softmax output.
emb_model.compile(optimizer='adam' , loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Show the architecture again after compiling (same table as at build time).
emb_model.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 35, 8)             144000    
                                                                 
 global_max_pooling1d_8 (Glo  (None, 8)                0         
 balMaxPooling1D)                                                
                                                                 
 dense_14 (Dense)            (None, 6)                 54        
                                                                 
Total params: 144,054
Trainable params: 144,054
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 11 epochs in batches of 32, evaluating on the held-out split.
emb_model.fit(X_train_emb,y_train_emb, validation_data=(X_valid_emb, y_valid_emb), epochs=11, batch_size=32 )
# Original remark: trained the model twice.
# NOTE(review): a fresh Restart & Run All executes this cell only once —
# confirm which training state the submitted predictions rely on.

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


<keras.callbacks.History at 0x7f9bd02cac90>

# Testing

In [None]:
# Load the uploaded test CSV from memory, the same way as the training file.
df_test=pd.read_csv(io.BytesIO(uploaded['test (1).csv']))
# Preview the frame (columns: id, text).
df_test

Unnamed: 0,id,text
0,1,im feeling rather rotten so im not very ambiti...
1,2,im updating my blog because i feel shitty
2,3,i never make her separate from me because i do...
3,4,i left with my bouquet of red and yellow tulip...
4,5,i was feeling a little vain when i did this one
...,...,...
1995,1996,i just keep feeling like someone is being unki...
1996,1997,im feeling a little cranky negative after this...
1997,1998,i feel that i am useful to my people and that ...
1998,1999,im feeling more comfortable with derby i feel ...


In [None]:
# Expand contractions across the whole test text column.
# Rebinding a loop variable (`i = contractions.fix(i)`) never writes back to
# df_test, so the transformed values are assigned to the column explicitly.
# Lower-case afterwards: expansions may come back capitalised (e.g. "I am" —
# NOTE(review): confirm for the contractions version in use), and the
# stop-word filter applied next compares words case-sensitively.
df_test['text'] = df_test['text'].apply(contractions.fix).str.lower()

In [None]:
# Apply the same stop-word filter that was used on the training text.
df_test['text'] = df_test['text'].apply(remove_stopwords)

In [None]:
# Test features: the cleaned text column.
X_test=df_test['text']
X_test

0                   feeling rather rotten ambitious right
1                               updating blog feel shitty
2         never make separate ever want feel like ashamed
3       left bouquet red yellow tulips arm feeling sli...
4                                 feeling little vain one
                              ...                        
1995    keep feeling like someone unkind wrong think g...
1996    feeling little cranky negative doctors appoint...
1997    feel useful people gives great feeling achieve...
1998    feeling comfortable derby feel though start st...
1999    feel weird meet w people text like dont talk f...
Name: text, Length: 2000, dtype: object

In [None]:
# Encode with the tokenizer fitted on the training text (it is not re-fitted
# here), so test ids come from the same word index.
X_test_seq = tk.texts_to_sequences(X_test)

In [None]:
# Same fixed length (35) as the training sequences, as the model input expects.
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=35)

In [None]:
# Per-class scores for every test row (output of the final softmax layer).
output=emb_model.predict(X_test_seq_trunc)

In [None]:
# Index of the highest-scoring class in each row.
output1=np.argmax(output, axis=1)

In [None]:
# Map class indices back to the original label values via the fitted encoder.
output1=le.inverse_transform(output1)

In [None]:
# Expect one prediction per test row.
output1.shape

(2000,)

In [None]:
# Attach the predicted labels to the test frame.
df_test['label'] = output1.tolist()

In [None]:
# Drop the text column so that only id and the predicted label remain.
df_test=df_test.drop(columns=['text'])
df_test

Unnamed: 0,id,label
0,1,0
1,2,0
2,3,0
3,4,1
4,5,0
...,...,...
1995,1996,3
1996,1997,3
1997,1998,1
1998,1999,1


In [None]:
# Write the predictions to disk without the DataFrame index.
df_test.to_csv("sample_submission.csv",index=False)

In [None]:
# `files` was already imported for the upload step; re-importing is harmless.
from google.colab import files
# Trigger a browser download of the submission file.
files.download("sample_submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install contractions
import contractions
df['text'][0]=contractions.fix(df['text'][0])



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Expand contractions across the training text column.
# Rebinding a loop variable (`i = contractions.fix(i)`) never writes back to
# df, so the transformed values are assigned to the column explicitly.
# Lower-case afterwards: expansions may come back capitalised (e.g. "I am" —
# NOTE(review): confirm for the contractions version in use), and the
# stop-word filter compares words case-sensitively.
df['text'] = df['text'].apply(contractions.fix).str.lower()

In [None]:
# Expand contractions across the test text column.
# Rebinding a loop variable (`i = contractions.fix(i)`) never writes back to
# df_test, so the transformed values are assigned to the column explicitly.
# Lower-case afterwards: expansions may come back capitalised (e.g. "I am" —
# NOTE(review): confirm for the contractions version in use), and the
# stop-word filter compares words case-sensitively.
df_test['text'] = df_test['text'].apply(contractions.fix).str.lower()

In [None]:
import nltk
from nltk.corpus import stopwords

# Make sure NLTK's stop-word corpus is available locally.
nltk.download('stopwords')

# Rebuild the stop-word set, this time also filtering 'feel' / 'feeling'.
stop_words = set(stopwords.words('english')) | {'im', 'ive', 'feel', 'feeling'}
def remove_stopwords(text, words_to_drop=None):
    """Return `text` with stop words removed.

    The input is converted with str(), split on whitespace, filtered, and
    re-joined with single spaces. Matching is case-sensitive.

    Args:
        text: Value to clean; anything accepted by str().
        words_to_drop: Optional container of words to remove. When omitted,
            the notebook-level `stop_words` set is used, as before.

    Returns:
        The remaining words joined by single spaces ("" if none remain).
    """
    if words_to_drop is None:
        # Looked up at call time, so the set active when the cell runs is used.
        words_to_drop = stop_words
    return " ".join(word for word in str(text).split() if word not in words_to_drop)
# Re-filter the training text with the extended stop-word set.
df['text'] = df['text'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Re-filter the test text with the extended stop-word set as well.
df_test['text'] = df_test['text'].apply(remove_stopwords)

In [None]:
# Vocabulary cap and fixed output length for the text-vectorization layer.
# NOTE(review): seq_lengths.describe() earlier reported at most 35 words per
# text, so a length of 250 is mostly padding — confirm this is intended.
max_features = 10000
sequence_length = 250

# Turns raw strings into integer token ids of length sequence_length.
vectorize_layer = layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [None]:
# Rebind X_train to the re-filtered text column ('feel' / 'feeling' removed).
X_train=df['text']
X_train

0                                               humiliated
1        go hopeless damned hopeful around someone care...
2                        grabbing minute post greedy wrong
3             ever nostalgic fireplace know still property
4                                                  grouchy
                               ...                        
17995    ssa examination tomorrow morning quite well pr...
17996    constantly worry fight nature push limits inne...
17997                important share info experience thing
17998    truly passionate enough something stay true su...
17999         like wanna buy cute make see online even one
Name: text, Length: 18000, dtype: object

In [None]:
# Fit the vectorization layer's vocabulary on the training text.
vectorize_layer.adapt(X_train)

In [None]:
# Display the training text again (same Series as shown two cells earlier).
X_train

0                                               humiliated
1        go hopeless damned hopeful around someone care...
2                        grabbing minute post greedy wrong
3             ever nostalgic fireplace know still property
4                                                  grouchy
                               ...                        
17995    ssa examination tomorrow morning quite well pr...
17996    constantly worry fight nature push limits inne...
17997                important share info experience thing
17998    truly passionate enough something stay true su...
17999         like wanna buy cute make see online even one
Name: text, Length: 18000, dtype: object

In [None]:
# Integer labels straight from the frame (no one-hot encoding in this run).
Y_train=df['label']

In [None]:
def vectorize_text(text, label):
  """Vectorize one text example and pass its label through unchanged.

  `text` gets a trailing axis added with tf.expand_dims before it is passed
  to vectorize_layer. Returns the tuple (vectorized text, label).

  NOTE(review): `tf` is not imported in any cell visible in this notebook —
  confirm `import tensorflow as tf` has been run before this is called.
  """
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [None]:
# Try the helper on the first row; the result is a (1, 250) tensor of ids.
vectorize_text(df['text'][0],df['label'][0])

(<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
 array([[547,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

In [None]:
# Averaged-embedding model over token ids, with dropout before and after pooling.
# NOTE(review): `tf` is not imported in any cell visible in this notebook.
# NOTE(review): the head is a single unit, while the labels shown earlier
# span six classes (one-hot width 6) — confirm a binary head is intended.
embedding_dim = 16
model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(1)])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 16)          160016    
                                                                 
 dropout_2 (Dropout)         (None, None, 16)          0         
                                                                 
 global_average_pooling1d_1   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_3 (Dropout)         (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
________________________________________________

In [None]:
# Binary cross-entropy on raw logits; accuracy thresholds the logit at 0.0.
# NOTE(review): neither `losses` nor `tf` is imported in any cell visible in
# this notebook, and the labels take six values — confirm the loss choice.
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

In [None]:
# Feed the model integer token ids rather than raw strings: the network
# starts with an Embedding layer, so the text is first run through the
# adapted vectorize_layer. Inputs are derived here from X_train / Y_train so
# the cell does not depend on X1 / Y1, which are only assigned in cells
# further down the notebook.
# NOTE(review): the labels take six values while the model is compiled with a
# binary loss — confirm the intended objective before relying on this fit.
history = model.fit(
    x=vectorize_layer(X_train.to_numpy()),
    y=Y_train.to_numpy(),
    epochs=10,
)

Epoch 1/10


ValueError: ignored

In [None]:
# Check column dtypes (text is stored as object).
df.dtypes

id        int64
text     object
label     int64
dtype: object

In [None]:
# Cast every text value to str.
# NOTE(review): X_train was bound to df['text'] before this reassignment —
# confirm whether it needs to be re-derived to pick up the cast.
df["text"] = df["text"].astype(str)

In [None]:
# Training text as a NumPy array.
X1=X_train.to_numpy()

In [None]:
# Training labels as a NumPy array, displayed to check the values.
Y1=Y_train.to_numpy()
Y1

array([0, 0, 3, ..., 1, 1, 1])