In [1]:
import pandas as pd
from NlpStudyFunctions import CleanDataframe
from sklearn.model_selection import train_test_split

train_df_raw = pd.read_csv(r'../train.csv')

#clean the df
train_df = CleanDataframe(train_df_raw,'text')

#split train-test set
x_train, x_test, y_train, y_test = train_test_split(train_df['text'], train_df['target'], test_size=0.2, random_state=42)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\guibe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\guibe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Convert lists to NumPy arrays (if not already done)
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

# Number of words to keep based on word frequency.
num_words = 10000

# Using the Tokenizer class from Keras
tokenizer = Tokenizer(num_words=num_words) 
tokenizer.fit_on_texts(x_train)

# Convert x_train and x_test to lists of strings if they are not already in that format
x_train_list = x_train.tolist() if isinstance(x_train, np.ndarray) else x_train
x_test_list = x_test.tolist() if isinstance(x_test, np.ndarray) else x_test

# Convert texts to sequences of integers
x_train_sequences = tokenizer.texts_to_sequences(x_train_list)
x_test_sequences = tokenizer.texts_to_sequences(x_test_list)

# Calculate the maximum sequence length
max_sequence_length = max(len(x) for x in x_train_sequences)

# Pad sequences to the same length
x_train_padded = pad_sequences(x_train_sequences, maxlen=max_sequence_length)
x_test_padded = pad_sequences(x_test_sequences, maxlen=max_sequence_length)

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy

#building deep learning model
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=500, input_length=max_sequence_length))
model.add(LSTM(128, return_sequences=True, dropout=0.2))
model.add(LSTM(512, dropout=0.2))
model.add(Dense(1024, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(2, activation='softmax'))


# Explicitly build the model
model.build(input_shape=(None, max_sequence_length))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [4]:
model.fit(x_train_padded, y_train, validation_data=(x_test_padded, y_test), epochs=10, batch_size=32)

Epoch 1/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 253ms/step - accuracy: 0.6684 - loss: 0.6317 - val_accuracy: 0.7991 - val_loss: 0.4450
Epoch 2/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 236ms/step - accuracy: 0.8749 - loss: 0.3153 - val_accuracy: 0.7840 - val_loss: 0.5103
Epoch 3/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 233ms/step - accuracy: 0.9264 - loss: 0.2007 - val_accuracy: 0.7577 - val_loss: 0.5316
Epoch 4/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 231ms/step - accuracy: 0.9560 - loss: 0.1330 - val_accuracy: 0.7485 - val_loss: 0.9075
Epoch 5/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 229ms/step - accuracy: 0.9666 - loss: 0.0806 - val_accuracy: 0.7610 - val_loss: 1.1693
Epoch 6/10
[1m191/191[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 224ms/step - accuracy: 0.9695 - loss: 0.0633 - val_accuracy: 0.7649 - val_loss: 1.0430
Epoch 7/10

<keras.src.callbacks.history.History at 0x2442fc0a8a0>

In [5]:
loss, accuracy = model.evaluate(x_test_padded, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 75ms/step - accuracy: 0.7428 - loss: 1.9388
Test Loss: 1.8623
Test Accuracy: 0.7571


In [20]:
test_df_raw = pd.read_csv(r'../test.csv')

# Clean the df
text_test_df = CleanDataframe(test_df_raw,'text')

# Convert lists to NumPy arrays (if not already done)
text_test_df = np.array(text_test_df)

# Convert back to list
text_test_df = text_test_df.tolist() if isinstance(text_test_df, np.ndarray) else text_test_df

# Convert numbers to strings
text_test_df = [str(item) for item in text_test_df]

# Now, proceed with tokenization
tokenizer.fit_on_texts(text_test_df)

# Fit tokenizer on text
tokenizer.fit_on_texts(text_test_df)

test_df_sequences = tokenizer.texts_to_sequences(text_test_df)

# Calculate the maximum sequence length
max_sequence_length = max(len(x) for x in test_df_sequences)

# Pad sequences to the same length
test_df_padded = pad_sequences(test_df_sequences, maxlen=max_sequence_length)


predictions = model.predict(test_df_padded)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 120ms/step


In [23]:
predictions = pd.DataFrame(predictions)

# Assuming `predictions` is already a DataFrame
predictions.columns = ['column_0', 'column_1']

# Update `column_0` based on the comparison
for index, row in predictions.iterrows():
    if row['column_0'] < row['column_1']:
        predictions.at[index, 'column_0'] = 1
    else:
        predictions.at[index, 'column_0'] = 0

# Convert `column_0` to integer type
predictions['column_0'] = predictions['column_0'].astype(int)

# Create the binary_predictions variable
binary_predictions = predictions['column_0']

# Create the submission DataFrame
submission = pd.DataFrame({'id': test_df_raw['id'], 'target': binary_predictions})

# Display the submission DataFrame
print(submission)


         id  target
0         0       0
1         2       1
2         3       1
3         9       0
4        11       0
...     ...     ...
3258  10861       1
3259  10865       0
3260  10868       0
3261  10874       0
3262  10875       0

[3263 rows x 2 columns]


In [24]:
submission.to_csv(r'../submission.csv',index=False)

In [25]:
submission = pd.read_csv(r'../submission.csv')
submission.describe()

Unnamed: 0,id,target
count,3263.0,3263.0
mean,5427.152927,0.241496
std,3146.427221,0.428056
min,0.0,0.0
25%,2683.0,0.0
50%,5500.0,0.0
75%,8176.0,0.0
max,10875.0,1.0


In [26]:
import kaggle

# Replace 'submission.csv' with the path to your submission file
submission_file = r'../submission.csv'

# Replace 'Message' with your submission message
submission_message = 'DL model try'

# Call the submit function from kaggle package
#kaggle.api.competition_submit(submission_file, submission_message, competition='nlp-getting-started')

100%|██████████| 25.4k/25.4k [00:00<00:00, 46.5kB/s]
  """Creates (aka \&quot;drops\&quot;) a new file into the inbox.  # noqa: E501
  """Creates (aka \&quot;drops\&quot;) a new file into the inbox.  # noqa: E501
  sub_kls = re.match('list\[(.*)\]', klass).group(1)
  sub_kls = re.match('dict\(([^,]*), (.*)\)', klass).group(2)


ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Content-Type': 'application/json', 'Date': 'Sun, 09 Jun 2024 14:17:07 GMT', 'Access-Control-Allow-Credentials': 'true', 'Access-Control-Allow-Origin': '*', 'Set-Cookie': 'ka_sessionid=593e3aba508eb06301b402c524188426; max-age=2626560; path=/, GCLB=CNqF1NXz652mIxAD; path=/; HttpOnly', 'Transfer-Encoding': 'chunked', 'Vary': 'Accept-Encoding', 'X-Kaggle-MillisecondsElapsed': '70', 'X-Kaggle-RequestId': '9e6c60733a9c5048e456ef804c48cb02', 'X-Kaggle-ApiVersion': '1.6.14', 'X-Kaggle-HubVersion': '0.2.5', 'X-Frame-Options': 'SAMEORIGIN', 'Strict-Transport-Security': 'max-age=63072000; includeSubDomains; preload', 'Content-Security-Policy': "object-src 'none'; script-src 'nonce-BCUnjnSViwj/Z57qqwd+jA==' 'report-sample' 'unsafe-inline' 'unsafe-eval' 'strict-dynamic' https: http:; base-uri 'none'; report-uri https://csp.withgoogle.com/csp/kaggle/20201130; frame-src 'self' https://www.kaggleusercontent.com https://www.youtube.com/embed/ https://polygraph-cool.github.io https://www.google.com/recaptcha/ https://www.docdroid.com https://www.docdroid.net https://kaggle-static.storage.googleapis.com https://kkb-production.jupyter-proxy.kaggle.net https://kkb-production.firebaseapp.com https://kaggle-metastore.firebaseapp.com https://apis.google.com https://content-sheets.googleapis.com/ https://accounts.google.com/ https://storage.googleapis.com https://docs.google.com https://drive.google.com https://calendar.google.com/;", 'X-Content-Type-Options': 'nosniff', 'Referrer-Policy': 'strict-origin-when-cross-origin', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"code":400,"message":"Submission not allowed:  Your team has used its daily Submission allowance (5) today, please try again tomorrow UTC (9.7 hours from now)."}
