In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Read the training and test CSVs from the working directory, in that order.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Print the per-column count of missing values for each frame.
for heading, frame in (
    ("Null values in training data:", train_df),
    ("Null values in testing data:", test_df),
):
    print(heading)
    print(frame.isnull().sum())

Null values in training data:
id             0
keyword       61
place       2533
tweet          0
disaster       0
dtype: int64
Null values in testing data:
id            0
keyword      26
place      1105
tweet         0
dtype: int64


In [None]:
# Replace missing keyword/place values with empty strings so the later string
# concatenation does not propagate NaN into the combined text column.
#
# The filled column is assigned back explicitly: calling
# `frame[column].fillna(..., inplace=True)` operates on an intermediate object,
# which triggers a FutureWarning on recent pandas 2.x and leaves the frame
# unchanged once Copy-on-Write is enabled.
for frame in (train_df, test_df):
    for column in ('keyword', 'place'):
        frame[column] = frame[column].fillna('')

In [None]:
# Combine keyword, place and tweet into one space-separated 'text' column on
# each frame.
for frame in (train_df, test_df):
    frame['text'] = frame['keyword'] + ' ' + frame['place'] + ' ' + frame['tweet']

In [None]:
def preprocess(text):
    """Return ``text`` converted to lowercase.

    The argument must provide a ``lower()`` method (e.g. a ``str``); no other
    normalisation is performed.
    """
    lowered = text.lower()
    return lowered

In [None]:
# Run every combined text value through preprocess() on both frames.
for frame in (train_df, test_df):
    frame['text'] = frame['text'].apply(preprocess)

In [None]:
# Target labels come straight from the training frame.
y_train = train_df['disaster']

# Fit a TF-IDF vocabulary (at most 5000 features) on the training text and
# transform both frames with it, converting the results to dense arrays.
# NOTE(review): the vectorizer is fitted on every training row, including the
# rows that are later held out for validation, so validation scores may be
# slightly optimistic.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['text']).toarray()
X_test = vectorizer.transform(test_df['text']).toarray()


In [None]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split repeatable.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Train a 100-tree random forest (seeded) on the training portion of the split.
forest_params = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestClassifier(**forest_params)
rf_model.fit(X=X_train_split, y=y_train_split)


In [None]:
# Predict labels for the held-out validation rows.
y_val_pred = rf_model.predict(X=X_val_split)

In [None]:
# Show per-class precision/recall/F1 for the validation predictions.
val_report = classification_report(y_val_split, y_val_pred)
print("Random Forest Classification Report:\n", val_report)

# Show the validation confusion matrix.
val_confusion = confusion_matrix(y_val_split, y_val_pred)
print("Confusion Matrix:\n", val_confusion)


Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.90      0.82       874
           1       0.82      0.61      0.70       649

    accuracy                           0.78      1523
   macro avg       0.79      0.76      0.76      1523
weighted avg       0.79      0.78      0.77      1523

Confusion Matrix:
 [[789  85]
 [252 397]]


In [None]:
# Predict labels for the test-set feature matrix with the fitted forest.
test_predictions = rf_model.predict(X=X_test)

In [None]:
# Pair each test id with its predicted label and write the result to
# 'submission.csv' without the index column.
submission_columns = {'id': test_df['id'], 'disaster': test_predictions}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Overall validation accuracy, printed to four decimal places.
accuracy = accuracy_score(y_true=y_val_split, y_pred=y_val_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print each remaining metric under its own heading.
for heading, metric_fn in (
    ("Classification Report:", classification_report),
    ("Confusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric_fn(y_val_split, y_val_pred))

# NOTE(review): this repeats the test-set prediction already computed earlier
# in the notebook; kept so the variable is refreshed when this cell is run.
test_predictions = rf_model.predict(X_test)

Accuracy: 0.7787
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.90      0.82       874
           1       0.82      0.61      0.70       649

    accuracy                           0.78      1523
   macro avg       0.79      0.76      0.76      1523
weighted avg       0.79      0.78      0.77      1523

Confusion Matrix:
[[789  85]
 [252 397]]
