# Load and exploration of dataset

In [10]:
# Import necessary libraries
import pandas as pd

# Load the training data
train_df = pd.read_csv('train.csv')

# Load the test data
test_df = pd.read_csv('test.csv')

# Display the first few rows of each dataset to understand their structure
print("Training data:")
print(train_df.head())

print("\nTest data:")
print(test_df.head())



Training data:
              id                                               text  \
0   500796286320  Wow! From what I've observed from this documen...   
1   838906157157  काय रे dungnat मेंदु असणाऱ्या आंधभक्ता तुझा आई...   
2  1011026626743  अजित दादा आणि प्रफुल्ल पटेल यांनी केलेल्या काम...   
3  1068853499446  She's saying that "doing her own research" led...   
4   502772748919  That is not Karen, that is perfectly reasonabl...   

   complaint  demands  praise  questions  
0        0.0      0.0     1.0        0.0  
1        1.0      0.0     0.0        0.0  
2        0.0      0.0     1.0        0.0  
3        1.0      0.0     0.0        0.0  
4        1.0      0.0     0.0        0.0  

Test data:
              id                                               text
0  1041016773991  मोदी साहेब वरती चांगले तो पण त्यांच्या साईटला ...
1   109362481297  In #Jawan you will get: 1. 1st class action 2....
2   985019053532  किती दहशत आहे दोन्ही पवारांची बारामती कर मोकळे...
3   436629695381

# Preprocessing the model


In [4]:
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Text cleaning function
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = text.strip()  # Remove leading/trailing whitespace
    return text

# Apply cleaning
train_df['text'] = train_df['text'].apply(clean_text)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_df['text'])
X = tokenizer.texts_to_sequences(train_df['text'])
X = pad_sequences(X, maxlen=100)

# Labels
y = train_df[['complaint', 'demands', 'praise', 'questions']].values

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# Building the model

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Build the model
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=100),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dense(4, activation='sigmoid')  # 4 outputs for multi-label classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()




# Training the model

In [6]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=32
)


Epoch 1/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 81ms/step - accuracy: 0.4069 - loss: 0.5471 - val_accuracy: 0.4712 - val_loss: 0.5070
Epoch 2/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 71ms/step - accuracy: 0.4353 - loss: 0.5048 - val_accuracy: 0.4550 - val_loss: 0.5067
Epoch 3/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 76ms/step - accuracy: 0.4436 - loss: 0.4993 - val_accuracy: 0.4487 - val_loss: 0.5069
Epoch 4/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 76ms/step - accuracy: 0.5179 - loss: 0.4653 - val_accuracy: 0.4850 - val_loss: 0.5011
Epoch 5/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 79ms/step - accuracy: 0.6143 - loss: 0.3803 - val_accuracy: 0.4737 - val_loss: 0.5444


# Evaluating the modeel

In [7]:
# Evaluate on validation data
val_loss, val_accuracy = model.evaluate(X_val, y_val)
print(f"Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}")


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.4757 - loss: 0.5446
Validation Loss: 0.5443693399429321, Validation Accuracy: 0.4737499952316284


# Making predictions

In [8]:
# Preprocess the test data
test_df['text'] = test_df['text'].apply(clean_text)
X_test = tokenizer.texts_to_sequences(test_df['text'])
X_test = pad_sequences(X_test, maxlen=100)

# Predict probabilities
predictions = model.predict(X_test)

# Create submission file
submission = pd.DataFrame(predictions, columns=['complaint', 'demands', 'praise', 'questions'])
submission.insert(0, 'id', test_df['id'])
submission.to_csv('submission.csv', index=False)


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step
