In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from pylab import rcParams
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
from nltk import pos_tag


In [16]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to C:\Users\RAJESH
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\RAJESH
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to C:\Users\RAJESH
[nltk_data]     KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\RAJESH KUMAR\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
###### Import data #####

In [12]:
train=pd.read_csv("C:\\Users\\RAJESH KUMAR\\Documents\\disaster tweets\\train.csv")
test=pd.read_csv("C:\\Users\\RAJESH KUMAR\\Documents\\disaster tweets\\test.csv")

In [13]:
train.shape

(7613, 5)

In [14]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [8]:
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [23]:
# Text preprocessing
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [24]:
def preprocess_text(text):
    words = word_tokenize(text.lower())  # Tokenization and lowercase
    words = [stemmer.stem(word) for word in words if word.isalpha() and word not in stop_words] 
    return words

In [25]:
train['cleaned_text'] = train['text'].apply(preprocess_text)


In [26]:
# Frequency distribution of words
all_words = [word for words in train['cleaned_text'] for word in words]
fdist = FreqDist(all_words)
print("Most common words:", fdist.most_common(10))


Most common words: [('http', 4716), ('like', 409), ('fire', 356), ('amp', 344), ('get', 309), ('bomb', 230), ('new', 223), ('via', 218), ('one', 204), ('peopl', 198)]


In [27]:
# Sentiment analysis
sia = SentimentIntensityAnalyzer()

In [28]:
def get_sentiment_score(text):
    sentiment = sia.polarity_scores(text)
    return sentiment

In [29]:
train['sentiment'] = train['text'].apply(get_sentiment_score)

In [30]:
# Part-of-speech tagging
def get_pos_tags(text):
    words = word_tokenize(text)
    pos_tags = pos_tag(words)
    return pos_tags

In [31]:
train['pos_tags'] = train['text'].apply(get_pos_tags)

In [33]:
# Display the processed DataFrame
train.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,sentiment,pos_tags
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deed, reason, earthquak, may, allah, forgiv, us]","{'neg': 0.0, 'neu': 0.851, 'pos': 0.149, 'comp...","[(Our, PRP$), (Deeds, NNS), (are, VBP), (the, ..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, rong, sask, canada]","{'neg': 0.286, 'neu': 0.714, 'pos': 0.0, 'comp...","[(Forest, NNP), (fire, NN), (near, IN), (La, N..."
2,5,,,All residents asked to 'shelter in place' are ...,1,"[resid, ask, place, notifi, offic, evacu, shel...","{'neg': 0.095, 'neu': 0.905, 'pos': 0.0, 'comp...","[(All, DT), (residents, NNS), (asked, VBD), (t..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[peopl, receiv, wildfir, evacu, order, califor...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","[(13,000, CD), (people, NNS), (receive, JJ), (..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, rubi, alaska, smoke, wildfi...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","[(Just, RB), (got, VBN), (sent, VBD), (this, D..."


In [50]:

# Sentiment analysis function
def get_sentiment_label(score):
    if score['compound'] >= 0.05:
        return '1'
    elif score['compound'] <= -0.05:
        return '0'
   

In [51]:
# Apply sentiment analysis on test data
test['sentiment_scores'] = test['text'].apply(lambda x: sia.polarity_scores(x))
test['target'] = test['sentiment_scores'].apply(get_sentiment_label)

In [52]:
test.head()

Unnamed: 0,id,keyword,location,text,sentiment_scores,sentiment_label,target
0,0,,,Just happened a terrible car crash,"{'neg': 0.659, 'neu': 0.341, 'pos': 0.0, 'comp...",0,0.0
1,2,,,"Heard about #earthquake is different cities, s...","{'neg': 0.0, 'neu': 0.734, 'pos': 0.266, 'comp...",1,1.0
2,3,,,"there is a forest fire at spot pond, geese are...","{'neg': 0.251, 'neu': 0.749, 'pos': 0.0, 'comp...",0,0.0
3,9,,,Apocalypse lighting. #Spokane #wildfires,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neutral,
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,"{'neg': 0.333, 'neu': 0.667, 'pos': 0.0, 'comp...",0,0.0


In [53]:
# Extract 'id' and 'target' columns
extracted_data = test[['id', 'target']]

In [48]:
# Define the file path for the CSV file
csv_file_path = 'sample_extracted_data.csv'

# Write to a CSV file
extracted_data.to_csv('extracted_data.csv', index=False)

In [49]:
print("Sample extracted DataFrame saved to CSV file:", csv_file_path)

Sample extracted DataFrame saved to CSV file: sample_extracted_data.csv


In [45]:
extracted_data.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,0
3,9,neutral
4,11,0
