In [None]:
import string
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# 1. Loading and preprocessing

In [None]:
# Load the training csv as a pandas DataFrame and keep only the columns
# that are useful for the problem. The explicit .copy() makes `df` an
# independent frame, so the column assignment below never writes into a
# slice of the loaded frame (the situation pandas' SettingWithCopyWarning
# is about).
df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
df = df[["text", "target"]].copy()

# The text is processed in the following way:
# - Each sentence is transformed to lowercase
# - Punctuation is removed
# The translation table is built once rather than once per row.
punctuation_table = str.maketrans("", "", string.punctuation)
df["text"] = df["text"].str.lower().str.translate(punctuation_table)

# Stratified train/test split to account for class imbalance. The fixed
# seed makes the split, and therefore the metrics reported further down,
# reproducible from one run to the next.
RANDOM_STATE = 42
x_train, x_test, y_train, y_test = train_test_split(
    df["text"],
    df["target"],
    test_size=0.3,
    stratify=df["target"],
    random_state=RANDOM_STATE,
)

# 2. Model: Multinomial Naive Bayes

In [None]:
# Two-stage pipeline: raw sentences are first turned into TF-IDF vectors,
# which are then fed to a Multinomial Naive Bayes classifier.
tfidf_step = TfidfVectorizer()
naive_bayes_step = MultinomialNB()
model = make_pipeline(tfidf_step, naive_bayes_step)

# Fit both stages on the training split.
model.fit(x_train, y_train)

In [None]:
# Predict on the held-out split and print the per-class metrics.
y_pred = model.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

# Multinomial Naive Bayes is a commonly used baseline for NLP problems,
# so the resulting accuracy is considered adequate.

# 3. Submission

In [None]:
# Load test.csv and keep only the columns needed for the submission.
# .copy() gives an independent frame for the column assignment below.
df_test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
df_test = df_test[["id", "text"]].copy()

# Preprocess the text in the same way as the training set (lowercase, then
# strip punctuation). The source column must be df_test["text"]: reading
# from the training frame `df` here would overwrite the test sentences with
# index-aligned training sentences, so the predictions would no longer
# correspond to the test ids.
punctuation_table = str.maketrans("", "", string.punctuation)
df_test["text"] = df_test["text"].str.lower().str.translate(punctuation_table)

In [None]:
# Run the fitted pipeline on the test sentences and store the predicted
# labels as a new "target" column, then preview the first ten rows.
test_predictions = model.predict(df_test["text"])
df_test["target"] = test_predictions
print(df_test.head(10))

In [None]:
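# Export (currently disabled).
# NOTE(review): the path below points inside "/kaggle/input", which is
# presumably read-only on Kaggle, and the file name is spelled "submision";
# the intended target is probably "/kaggle/working/submission.csv" — confirm
# before enabling this line.
# df_test[["id", "target"]].to_csv("/kaggle/input/nlp-getting-started/submision.csv", index=False)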