In [None]:
# Import necessary libraries
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import re
import logging
from flask import Flask, request, jsonify, render_template
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns
from dash import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output
import plotly.express as px
import streamlit as st
!pip install contractions
from contractions import fix
# Load the spaCy model for NLP tasks.
# `nlp` is shared module-wide: create_vec() and the request handlers below
# call it to turn text into document vectors.
nlp = spacy.load('en_core_web_sm')

# Initialize the Flask app; the route decorators further down register
# their handlers on this instance.
flask_app = Flask(__name__)
# Configure the root logger at INFO level.
logging.basicConfig(level=logging.INFO)

# Load train and test datasets from the current working directory.
# Later code reads a 'text' column from both frames and a 'target' column
# from the training frame.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Define a function to process data in chunks
def process_in_batches(data, batch_size=100):
    """Clean every text in ``data``, walking through it one batch at a time.

    Args:
        data: Sliceable, sized collection of texts (anything supporting
            ``len()`` and ``data[a:b]``).
        batch_size: Number of items taken per slice.

    Returns:
        A flat list holding ``enhanced_clean_text(text)`` for each item,
        in the original order.
    """
    # NOTE(review): enhanced_clean_text is not defined in the visible part
    # of this file; it is presumably provided elsewhere - confirm.
    total = len(data)
    cleaned = []

    for offset in range(0, total, batch_size):
        # Clamp the upper bound so the last slice stops at the end of data.
        stop = min(offset + batch_size, total)
        cleaned += [enhanced_clean_text(item) for item in data[offset:stop]]

    return cleaned

# Draw a random sample of 1000 rows from the training data.
# NOTE(review): no batch processing is applied here, and nothing in the
# visible code uses train_data_sample or calls process_in_batches - confirm
# whether the sample's 'text' column was meant to be cleaned with it.
# NOTE(review): per the pandas docs, sample() without replace=True raises
# ValueError when the frame has fewer rows than requested.
train_data_sample = train_data.sample(1000)


# Create vectors for text data using spaCy embeddings
def create_vec(dataframe):
    """Embed the ``text`` column with spaCy, one vector per row.

    Every text is run through the module-level ``nlp`` pipeline and its
    ``doc.vector`` becomes one output row, with columns named
    ``vec_0`` ... ``vec_{d-1}`` where ``d`` is the vector length.

    Args:
        dataframe: DataFrame containing a ``text`` column.

    Returns:
        A DataFrame of vector components with a fresh default integer
        index (the input's index is not carried over). An empty DataFrame
        is returned when ``dataframe`` has no rows.
    """
    texts = dataframe['text'].tolist()
    if not texts:
        # With no rows there is no first vector to size the columns from;
        # indexing vectors[0] below would raise IndexError.
        return pd.DataFrame()

    vectors = [list(doc.vector) for doc in nlp.pipe(texts)]
    column_names = [f'vec_{i}' for i in range(len(vectors[0]))]
    return pd.DataFrame(vectors, columns=column_names)

# Embed both datasets with spaCy and append the vec_* columns.
vec_train = create_vec(train_data)
vec_test = create_vec(test_data)
train_data = pd.concat([train_data, vec_train], axis=1)
test_data = pd.concat([test_data, vec_test], axis=1)

# Keep the tweet text before it is dropped below. TfidfVectorizer and the
# Keras Tokenizer are text models; fitting them on a stringified embedding
# component (the old 'vec_0' workaround) gives them nothing to learn from.
# NOTE(review): the request handlers clean their input with
# enhanced_clean_text before predicting; apply the same cleaning here if
# that helper is available at this point - confirm.
train_text = train_data['text'].astype(str)

# Drop unnecessary columns
columns_to_drop = ['keyword', 'location', 'text']
train_data.drop(columns=[col for col in columns_to_drop if col in train_data.columns], inplace=True)
test_data.drop(columns=[col for col in columns_to_drop if col in test_data.columns], inplace=True)

# Split features, text and labels in one call so all three share the same
# shuffled rows.
X_train, X_valid, text_train, text_valid, y_train, y_valid = train_test_split(
    train_data.drop('target', axis=1),
    train_text,
    train_data['target'],
    test_size=0.2,
    random_state=42,
)

# Naive Bayes pipeline: TF-IDF features over the tweet text.
nb_model = make_pipeline(TfidfVectorizer(), MultinomialNB())
nb_model.fit(text_train, y_train)

# Evaluate Naive Bayes model
y_pred = nb_model.predict(text_valid)
print("Naive Bayes Accuracy:", accuracy_score(y_valid, y_pred))
print("Naive Bayes Classification Report:\n", classification_report(y_valid, y_pred))

# Build an LSTM model
# The vocabulary size and sequence length are defined once so the
# tokenizer, the padding and the Embedding layer cannot drift apart.
vocab_size = 5000
max_length = 100  # Adjust this based on your data

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(text_train)
X_train_seq = tokenizer.texts_to_sequences(text_train)
X_valid_seq = tokenizer.texts_to_sequences(text_valid)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_valid_pad = pad_sequences(X_valid_seq, maxlen=max_length, padding='post', truncating='post')

lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_length),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train_pad, y_train, validation_data=(X_valid_pad, y_valid), epochs=5, batch_size=64)


# Flask routes
@flask_app.route('/')
def home():
    """Serve the landing page by rendering the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page

@flask_app.route('/predict', methods=['POST'])
def predict():
    """Classify one tweet submitted in the ``text`` form field.

    Returns:
        JSON of the form ``{"prediction": <int>}`` holding the label
        predicted by ``nb_model`` for the cleaned tweet.
    """
    input_text = request.form['text']
    cleaned_text = enhanced_clean_text(input_text)
    # nb_model is a pipeline whose first step is a TfidfVectorizer, so it
    # must receive raw strings; handing it a spaCy vector fails inside the
    # vectorizer's text preprocessing.
    prediction = nb_model.predict([cleaned_text])
    return jsonify({'prediction': int(prediction[0])})

@flask_app.route('/bulk_predict', methods=['POST'])
def bulk_predict():
    """Classify a batch of tweets sent as JSON ``{"tweets": [...]}``.

    Returns:
        ``{"predictions": [...]}`` with one label per tweet, in order.
        ``{"error": ...}`` with status 400 when the body is not a JSON
        object carrying a ``tweets`` list, or with status 500 when
        cleaning or prediction fails.
    """
    # silent=True yields None instead of raising on a missing or invalid
    # JSON body, so a malformed request is reported as a client error.
    payload = request.get_json(silent=True)
    tweets = payload.get('tweets', []) if isinstance(payload, dict) else None
    if not isinstance(tweets, list):
        return jsonify({'error': "Expected a JSON object with a 'tweets' list"}), 400
    if not tweets:
        # Nothing to classify; skip the model call entirely.
        return jsonify({'predictions': []})

    try:
        cleaned_tweets = [enhanced_clean_text(tweet) for tweet in tweets]
        # nb_model starts with a TfidfVectorizer, so it takes the cleaned
        # strings directly rather than spaCy vectors.
        predictions = nb_model.predict(cleaned_tweets)
        return jsonify({'predictions': predictions.tolist()})
    except Exception as e:
        # Request boundary: log with traceback and report the failure.
        logging.exception("Bulk prediction failed: %s", e)
        return jsonify({'error': str(e)}), 500

# Streamlit app for user interface: a text box plus a button that runs the
# Naive Bayes model on the entered tweet.
st.title("Disaster Tweets Classification")
user_input = st.text_area("Enter a tweet to classify:")
if st.button("Predict"):
    cleaned_input = enhanced_clean_text(user_input)
    # nb_model is a TF-IDF pipeline and expects raw strings, not a spaCy
    # vector, so the cleaned text is passed as a one-item list.
    prediction = nb_model.predict([cleaned_input])
    st.write("Prediction:", "Disaster" if prediction[0] else "Not Disaster")

# Run the Flask app
# Flask is imported and used at the top of the file, so it has to be
# installed before this script starts; installing it here could never help.
if __name__ == '__main__':
    # NOTE(review): debug=True turns on the interactive debugger and the
    # reloader; keep it to local development only.
    flask_app.run(debug=True)

