In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by the preprocessing below:
#   - 'stopwords': the stopword lists read via nltk.corpus.stopwords
#   - 'punkt' / 'punkt_tab': sentence-tokenizer models behind word_tokenize.
#     Older NLTK releases load 'punkt'; newer releases look up 'punkt_tab'
#     instead and raise LookupError if it is missing, so both are requested
#     to keep the notebook runnable on either.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Read the raw tweets CSV (path is relative to the notebook's working
# directory) into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="Tweets.csv")

In [4]:
# Quick sanity check of what was loaded: (rows, columns) and the column index.
for label, value in (("Dataset shape:", df.shape), ("Columns:", df.columns)):
    print(label, value)

Dataset shape: (14640, 15)
Columns: Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')


In [8]:
# Dropping unnecessary columns: keep only the tweet text and its sentiment label.
# .copy() makes the result an explicitly independent frame, so pandas does not
# treat the column assignments in later cells as writes to a slice of the
# original frame (the usual source of SettingWithCopyWarning).
df = df[['text', 'airline_sentiment']].copy()
# Checking for missing values (per-column count of nulls)
print("Missing values:\n", df.isnull().sum())

Missing values:
 text                 0
airline_sentiment    0
dtype: int64


In [9]:
# Encode the textual sentiment label as an integer score in a new column.
# Labels absent from the lookup come out of Series.map as NaN.
label_to_score = {'positive': 1, 'neutral': 0, 'negative': -1}
df['sentiment'] = df['airline_sentiment'].map(label_to_score)
df

Unnamed: 0,text,airline_sentiment,sentiment
0,@VirginAmerica What @dhepburn said.,neutral,0
1,@VirginAmerica plus you've added commercials t...,positive,1
2,@VirginAmerica I didn't today... Must mean I n...,neutral,0
3,@VirginAmerica it's really aggressive to blast...,negative,-1
4,@VirginAmerica and it's a really big bad thing...,negative,-1
...,...,...,...
14635,@AmericanAir thank you we got on a different f...,positive,1
14636,@AmericanAir leaving over 20 minutes Late Flig...,negative,-1
14637,@AmericanAir Please bring American Airlines to...,neutral,0
14638,"@AmericanAir you have my money, you change my ...",negative,-1


In [13]:
# Lazily-filled cache of NLTK's English stopwords, so the corpus is loaded
# once rather than on every call (the function runs once per row under
# DataFrame.apply).
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stopword list as a frozenset, loading it on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


# Function to clean tweets
def preprocess_tweet(text, stop_words=None):
    """Normalize a raw tweet into a space-separated string of content words.

    Steps, in order: lowercase; strip URLs, @mentions and #hashtags (the whole
    hashtag token, not just the '#'); drop every character that is not an
    ASCII letter or whitespace; tokenize with NLTK's word_tokenize; remove
    stopwords; re-join the surviving tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. NaN from a missing cell)
        is treated as an empty tweet.
    stop_words : iterable of str, optional
        Words to drop after tokenization. Defaults to NLTK's English
        stopword list.

    Returns
    -------
    str
        The cleaned tweet; an empty string if nothing survives the filters.
    """
    # Missing values reach .apply() as float NaN, which has no .lower();
    # return an empty result instead of raising AttributeError.
    if not isinstance(text, str):
        return ""

    # Convert text to lowercase
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    # NOTE(review): apostrophes are stripped here, before stopword filtering,
    # so contractions such as "didn't" become "didnt" and are presumably no
    # longer matched by apostrophe-bearing stopwords. Left unchanged so the
    # existing cleaned output stays stable -- confirm whether that is intended.
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters

    if stop_words is None:
        stop_words = _english_stop_words()
    elif not isinstance(stop_words, (set, frozenset)):
        # Accept any iterable but keep membership tests O(1).
        stop_words = frozenset(stop_words)

    # Tokenize the text into words, then drop the stopwords
    words = word_tokenize(text)
    return " ".join(word for word in words if word not in stop_words)

In [14]:
# Run the tweet cleaner over every row of 'text' and store the result in a
# new 'cleaned_text' column, then display the frame.
df['cleaned_text'] = df['text'].map(preprocess_tweet)
df

Unnamed: 0,text,airline_sentiment,sentiment,cleaned_text
0,@VirginAmerica What @dhepburn said.,neutral,0,said
1,@VirginAmerica plus you've added commercials t...,positive,1,plus youve added commercials experience tacky
2,@VirginAmerica I didn't today... Must mean I n...,neutral,0,didnt today must mean need take another trip
3,@VirginAmerica it's really aggressive to blast...,negative,-1,really aggressive blast obnoxious entertainmen...
4,@VirginAmerica and it's a really big bad thing...,negative,-1,really big bad thing
...,...,...,...,...
14635,@AmericanAir thank you we got on a different f...,positive,1,thank got different flight chicago
14636,@AmericanAir leaving over 20 minutes Late Flig...,negative,-1,leaving minutes late flight warnings communica...
14637,@AmericanAir Please bring American Airlines to...,neutral,0,please bring american airlines
14638,"@AmericanAir you have my money, you change my ...",negative,-1,money change flight dont answer phones suggest...


In [25]:
# Persist the frame (original text, label, numeric sentiment, cleaned text)
# to CSV without the row index, then confirm on stdout.
df.to_csv(path_or_buf="cleaned_tweets.csv", index=False)
print("Preprocessed data saved to cleaned_tweets.csv")

Preprocessed data saved to cleaned_tweets.csv
