In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import f1_score


In [2]:
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [3]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
train_df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [6]:
train_df[train_df['target'] == 1]['text'].values[78]

'I had a airplane accident.'

In [7]:
train_df[train_df['target'] == 0]['text'].values[40]

'http://t.co/GKYe6gjTk5 Had a #personalinjury accident this summer? Read our advice &amp; see how a #solicitor can help #OtleyHour'

In [8]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

In [9]:
train_df['modified_text'] = train_df['text'].apply(clean_text)

In [10]:
train_df[train_df['target'] == 0]['modified_text'].values[40]

'had a accident this summer read our advice amp see how a can help'

In [11]:
test_df['modified_text'] = test_df['text'].apply(clean_text)

In [12]:
tfidf = TfidfVectorizer(
    max_features = 5000,
    ngram_range=(1,2)
)

In [13]:
X_train = tfidf.fit_transform(train_df['modified_text'])
Y_train = train_df['target']

In [14]:
X_test = tfidf.transform(test_df['modified_text'])

In [15]:
model = LogisticRegression()

In [16]:
model.fit(X_train,Y_train)

In [17]:
pred = model.predict(X_test)

In [18]:
submission = pd.DataFrame({
    'id':test_df['id'],
    'target':pred
})

In [19]:
submission.to_csv('submission.csv', index=False)