# Kaggle Contest: Twitter tweet classification
1. Importing Numpy and Pandas libraries

In [1]:
import numpy as np
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED


2. Loading the dataset

In [2]:
data=pd.read_csv(r'train.csv')
data.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [3]:
data.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

3. Filtering Text and Target columns as they are useful for the classification purpose

In [4]:
new_data=data.drop(['id','keyword','location'],axis=1)
new_data

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


4. Download and import NLTK libraries to perform Text Pre-processing

In [5]:
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

5. Convert to lowercase

In [7]:
def lowercase_and_strip(text):
    text = text.lower().strip()
    return text
for i in range(len(new_data)):
    processed_text = lowercase_and_strip(new_data['text'][i])
    new_data.loc[i, 'text'] = processed_text
new_data

Unnamed: 0,text,target
0,our deeds are the reason of this #earthquake m...,1
1,forest fire near la ronge sask. canada,1
2,all residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,just got sent this photo from ruby #alaska as ...,1
...,...,...
7608,two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @thetawniest the out of control w...,1
7610,m1.94 [01:04 utc]?5km s of volcano hawaii. htt...,1
7611,police investigating after an e-bike collided ...,1


6. Remove HTML tags

In [8]:
def remove_html_tags(text):
    text = re.compile('<.*?>').sub('', text)
    return text

for i in range(len(new_data)):
    processed_text = remove_html_tags(new_data['text'][i])
    new_data.loc[i, 'text'] = processed_text
new_data

Unnamed: 0,text,target
0,our deeds are the reason of this #earthquake m...,1
1,forest fire near la ronge sask. canada,1
2,all residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,just got sent this photo from ruby #alaska as ...,1
...,...,...
7608,two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @thetawniest the out of control w...,1
7610,m1.94 [01:04 utc]?5km s of volcano hawaii. htt...,1
7611,police investigating after an e-bike collided ...,1


7. Remove Usermentions and Hashtags

In [9]:
def replace_mentions_and_hashtags(text):
    # Improved regex to handle usernames with underscores
    text = re.sub(r'@[\w_]+', '', text)
    hashtags = re.findall(r'#(\w+)', text)
    text = re.sub(r'#(\w+)', r'\1', text)
    return text

for i in range(len(new_data)):
    processed_text = replace_mentions_and_hashtags(new_data['text'][i])
    new_data.loc[i, 'text'] = processed_text
new_data

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask. canada,1
2,all residents asked to 'shelter in place' are ...,1
3,"13,000 people receive wildfires evacuation ord...",1
4,just got sent this photo from ruby alaska as s...,1
...,...,...
7608,two giant cranes holding a bridge collapse int...,1
7609,the out of control wild fires in california ...,1
7610,m1.94 [01:04 utc]?5km s of volcano hawaii. htt...,1
7611,police investigating after an e-bike collided ...,1


In [10]:
new_data['text'][6276]

'new item: pillow covers any size pillow cover grey pillow pillows premier prints lulu storm grey by mypillowstudio \x89û_ http://t.co/m4pqkkeevc'

8. Remove URLs or Hyperlinks

In [11]:
def remove_links(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    return text

for i in range(len(new_data)):
    processed_text = remove_links(new_data['text'][i])
    new_data.loc[i, 'text'] = processed_text
new_data

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask. canada,1
2,all residents asked to 'shelter in place' are ...,1
3,"13,000 people receive wildfires evacuation ord...",1
4,just got sent this photo from ruby alaska as s...,1
...,...,...
7608,two giant cranes holding a bridge collapse int...,1
7609,the out of control wild fires in california ...,1
7610,m1.94 [01:04 utc]?5km s of volcano hawaii.,1
7611,police investigating after an e-bike collided ...,1


In [12]:
new_data['text'][6276]

'new item: pillow covers any size pillow cover grey pillow pillows premier prints lulu storm grey by mypillowstudio \x89û_ '

9. Replace punctuation with space

In [13]:
def replace_punctuation_with_space(text):
    text = re.sub('\s+', ' ', text)
    return text

for i in range(len(new_data)):
    processed_text = replace_punctuation_with_space(new_data['text'][i])
    new_data.loc[i, 'text'] = processed_text
new_data

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask. canada,1
2,all residents asked to 'shelter in place' are ...,1
3,"13,000 people receive wildfires evacuation ord...",1
4,just got sent this photo from ruby alaska as s...,1
...,...,...
7608,two giant cranes holding a bridge collapse int...,1
7609,the out of control wild fires in california e...,1
7610,m1.94 [01:04 utc]?5km s of volcano hawaii.,1
7611,police investigating after an e-bike collided ...,1


10. Remove special characters

In [14]:
def remove_special_characters(text):
    text = re.compile('[^A-Za-z0-9@#]+').sub(' ', text)
    return text

for i in range(len(new_data)):
    processed_text = remove_special_characters(new_data['text'][i])
    new_data.loc[i, 'text'] = processed_text
new_data

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are be...,1
3,13 000 people receive wildfires evacuation ord...,1
4,just got sent this photo from ruby alaska as s...,1
...,...,...
7608,two giant cranes holding a bridge collapse int...,1
7609,the out of control wild fires in california e...,1
7610,m1 94 01 04 utc 5km s of volcano hawaii,1
7611,police investigating after an e bike collided ...,1


In [15]:
new_data['text'][6276]

'new item pillow covers any size pillow cover grey pillow pillows premier prints lulu storm grey by mypillowstudio '

11. Remove Stopwords

In [16]:
def remove_stopwords(text):
    stopwords = ['against', 'not', 'don', 'don\'t','ain', 'are', 'aren\'t', 'could', 'couldn\'t',
             'did', 'didn\'t', 'does', 'doesn\'t', 'had', 'hadn\'t', 'has', 'hasn\'t', 
             'have', 'haven\'t', 'is', 'isn\'t', 'might', 'mightn\'t', 'must', 'mustn\'t',
             'need', 'needn\'t','should', 'shouldn\'t', 'was', 'wasn\'t', 'were', 
             'weren\'t', 'won\'t', 'would', 'wouldn\'t',"a", "an", "the", "this", "that", "is", "it", "to", "and","of","over"]
    words = word_tokenize(text)
    filtered_sentence = [w for w in words if w.lower() not in stopwords]
    text = " ".join(filtered_sentence)
    return text

for i in range(len(new_data)):
    processed_text = remove_stopwords(new_data['text'][i])
    new_data.loc[i, 'text'] = processed_text
new_data

Unnamed: 0,text,target
0,our deeds reason earthquake may allah forgive ...,1
1,forest fire near la ronge sask canada,1
2,all residents asked shelter in place being not...,1
3,13 000 people receive wildfires evacuation ord...,1
4,just got sent photo from ruby alaska as smoke ...,1
...,...,...
7608,two giant cranes holding bridge collapse into ...,1
7609,out control wild fires in california even in n...,1
7610,m1 94 01 04 utc 5km s volcano hawaii,1
7611,police investigating after e bike collided wit...,1


12. Lemmatization using WordNetLemmatizer

In [17]:
def lemmatize_text(lemmatizer, text):
    # lemmatizer = WordNetLemmatizer()
    words = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    text = " ".join(lemmatized_words)
    return text

lemmatizer = WordNetLemmatizer()
for i in range(len(new_data)):
    processed_text = lemmatize_text(lemmatizer, new_data['text'][i])
    new_data.loc[i, 'text'] = processed_text
new_data

Unnamed: 0,text,target
0,our deed reason earthquake may allah forgive u...,1
1,forest fire near la ronge sask canada,1
2,all resident asked shelter in place being noti...,1
3,13 000 people receive wildfire evacuation orde...,1
4,just got sent photo from ruby alaska a smoke f...,1
...,...,...
7608,two giant crane holding bridge collapse into n...,1
7609,out control wild fire in california even in no...,1
7610,m1 94 01 04 utc 5km s volcano hawaii,1
7611,police investigating after e bike collided wit...,1


In [18]:
new_data['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

13. Train-test split

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(new_data[['text']],
                                                  new_data['target'],
                                                  test_size=0.20,
                                                  shuffle=True,
                                                  random_state=324
                                                 )

X_val, X_test, y_val, y_test = train_test_split(X_val,
                                                y_val,
                                                test_size=0.5,
                                                shuffle=True,
                                                random_state=324)

In [20]:
# Grab model features/inputs and target/output

text_features = ['text']

model_features = text_features
model_target = 'target'

14. TF-IDF Vectorization

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorization using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train['text'])
X_val_tfidf = vectorizer.transform(X_val['text'])
X_test_tfidf = vectorizer.transform(X_test['text'])

In [22]:
X_train['text']

1640    pain those second been awful a her heart burst...
2184    aircraft debris found on island from mh370 mal...
4747    what you gon na do now puppy no more destroyin...
4944    straight bass dubloadz droppd when they opened...
4493                    haha i love hurricane because you
                              ...                        
600     breakingnews fedex no longer willing transport...
1531    russian nuclear biological chemical nbc brigad...
908                                mega bloody marvellous
4852    s like god want me become mass murderer with h...
2713    ignition knock detonation sensor senso standar...
Name: text, Length: 6090, dtype: object

In [23]:
X_train=X_train_tfidf.toarray()
X_val=X_val_tfidf.toarray()
X_test=X_test_tfidf.toarray()

In [24]:
X_train.shape

(6090, 11832)

15. Training a classifier with a built-in SageMaker algorithm

In [25]:
import sagemaker

# Call the LinearLearner estimator object
linear_classifier = sagemaker.LinearLearner(role=sagemaker.get_execution_role(),
                                           instance_count=1,
                                           instance_type='ml.m4.xlarge',
                                           predictor_type='binary_classifier')

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [26]:
train_records = linear_classifier.record_set(X_train.astype('float32'),
                                            y_train.values.astype('float32'),
                                            channel='train')
val_records = linear_classifier.record_set(X_val.astype('float32'),
                                          y_val.values.astype('float32'),
                                          channel='validation')
test_records = linear_classifier.record_set(X_test.astype('float32'),
                                           y_test.values.astype('float32'),
                                           channel='test')

In [27]:
train_records

(<class 'sagemaker.amazon.amazon_estimator.RecordSet'>, {'s3_data': 's3://sagemaker-us-east-1-534518675133/sagemaker-record-sets/LinearLearner-2024-02-24-21-00-32-102/.amazon.manifest', 'feature_dim': 11832, 'num_records': 6090, 's3_data_type': 'ManifestFile', 'channel': 'train'})

In [28]:
linear_classifier.fit([train_records,
                       val_records,
                       test_records],
                       logs=False)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: linear-learner-2024-02-24-21-01-18-701



2024-02-24 21:01:19 Starting - Starting the training job.
2024-02-24 21:01:33 Starting - Preparing the instances for training........
2024-02-24 21:02:18 Downloading - Downloading input data.......
2024-02-24 21:02:58 Downloading - Downloading the training image...............
2024-02-24 21:04:18 Training - Training image download completed. Training in progress...........
2024-02-24 21:05:14 Uploading - Uploading generated training model..
2024-02-24 21:05:30 Completed - Training job completed


In [31]:
test_records

(<class 'sagemaker.amazon.amazon_estimator.RecordSet'>, {'s3_data': 's3://sagemaker-us-east-1-534518675133/sagemaker-record-sets/LinearLearner-2024-02-24-20-13-51-736/.amazon.manifest', 'feature_dim': 12899, 'num_records': 762, 's3_data_type': 'ManifestFile', 'channel': 'test'})

In [29]:
sagemaker.analytics.TrainingJobAnalytics(linear_classifier._current_job_name, 
                                         metric_names = ['test:binary_classification_accuracy']
                                        ).dataframe()

Unnamed: 0,timestamp,metric_name,value
0,0.0,test:binary_classification_accuracy,0.751969


# Loading and using the model to predict on Test Data

In [30]:
test=pd.read_csv(r'test.csv')
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [31]:
test=test.drop(['id','keyword','location'],axis=1)
test

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan
...,...
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,Storm in RI worse than last hurricane. My city...
3260,Green Line derailment in Chicago http://t.co/U...
3261,MEG issues Hazardous Weather Outlook (HWO) htt...


In [32]:

for i in range(len(test)):
    processed_text = lowercase_and_strip(test['text'][i])
    processed_text = remove_html_tags(processed_text)
    processed_text = replace_mentions_and_hashtags(processed_text)
    processed_text = remove_links(processed_text)
    processed_text = remove_special_characters(processed_text)

    processed_text = replace_punctuation_with_space(processed_text)
    processed_text = remove_stopwords(processed_text)
    processed_text = lemmatize_text(lemmatizer, processed_text)
    
    test.loc[i, 'text'] = processed_text
    

test


Unnamed: 0,text
0,just happened terrible car crash
1,heard about earthquake different city stay saf...
2,there forest fire at spot pond goose fleeing a...
3,apocalypse lighting spokane wildfire
4,typhoon soudelor kill 28 in china taiwan
...,...
3258,earthquake safety los angeles safety fastener ...
3259,storm in ri worse than last hurricane my city ...
3260,green line derailment in chicago
3261,meg issue hazardous weather outlook hwo


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

Test_tfidf = vectorizer.transform(test['text'])
Test=Test_tfidf.toarray()
Test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [34]:
Test.shape

(3263, 11832)

In [35]:
linear_classifier_predictor = linear_classifier.deploy(initial_instance_count = 1,
                                                       instance_type = 'ml.c5.large'
                                                      )

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: linear-learner-2024-02-24-21-06-31-638
INFO:sagemaker:Creating endpoint-config with name linear-learner-2024-02-24-21-06-31-638
INFO:sagemaker:Creating endpoint with name linear-learner-2024-02-24-21-06-31-638


-----------!

In [38]:
# Get the endpoint name
endpoint_name = linear_classifier_predictor.endpoint
# Print or use the endpoint name as needed
print("Endpoint Name:", endpoint_name)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


Endpoint Name: linear-learner-2024-02-24-20-27-40-895


In [36]:

# Get predictions for the unseen data
unseen_prediction_batches = [linear_classifier_predictor.predict(batch)
                             for batch in np.array_split(Test.astype('float32'), 25)
                            ]

# Extract and store predicted labels
predicted_labels = []

for batch_predictions in unseen_prediction_batches:
    for pred in batch_predictions:
        # Assuming 'predicted_label' is the key for the predicted label in your prediction response
        predicted_label = pred.label['predicted_label'].float32_tensor.values[0]
        predicted_labels.append(predicted_label)

# Convert the list to a NumPy array if needed
predicted_labels_array = np.array(predicted_labels)

# Print or use the predicted labels array as needed
print(predicted_labels_array)


[1. 1. 1. ... 1. 1. 1.]


In [42]:
count1=0
count2=0
for i in predicted_labels_array:
    if(i==1):
        count1+=1
    else:
        count2+=1
print(len(predicted_labels_array))
print("1s:",count1)
print("0s:",count2)

3263
1s: 1259
0s: 2004


In [43]:
# Convert the NumPy array to a DataFrame
predicted_df = pd.DataFrame({'target': predicted_labels_array})

# Print or use the DataFrame as needed
print(predicted_df)

      target
0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
...      ...
3258     1.0
3259     1.0
3260     1.0
3261     1.0
3262     1.0

[3263 rows x 1 columns]


In [44]:
test_data=pd.read_csv(r'test.csv')

In [45]:
test_data

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [46]:
test_data=test_data.drop(['keyword','location','text'],axis=1)
test_data

Unnamed: 0,id
0,0
1,2
2,3
3,9
4,11
...,...
3258,10861
3259,10865
3260,10868
3261,10874


In [47]:
# Concatenate the two DataFrames horizontally (along columns)
concatenated_df = pd.concat([test_data, predicted_df], axis=1)

# Print or use the concatenated DataFrame as needed
print(concatenated_df)

         id  target
0         0     1.0
1         2     1.0
2         3     1.0
3         9     1.0
4        11     1.0
...     ...     ...
3258  10861     1.0
3259  10865     1.0
3260  10868     1.0
3261  10874     1.0
3262  10875     1.0

[3263 rows x 2 columns]
