In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Read the labelled dataset from disk
data = pd.read_csv("train.csv")

# Features are the "text" column; labels are the "target" column
X, y = data["text"], data["target"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Unigram TF-IDF features: the vocabulary and IDF weights are fitted on the
# training split only, then reused unchanged to transform the test split
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# L2-penalised logistic regression using the liblinear solver
model = LogisticRegression(solver='liblinear', penalty='l2', C=1.0)
model.fit(X_train_vectorized, y_train)

# Predicted labels for the held-out rows
predictions = model.predict(X_test_vectorized)

# Per-class precision/recall/F1 table, followed by overall accuracy
print(classification_report(y_true=y_test, y_pred=predictions))
print(accuracy_score(y_true=y_test, y_pred=predictions))


              precision    recall  f1-score   support

           0       0.80      0.86      0.83       874
           1       0.79      0.71      0.75       649

    accuracy                           0.80      1523
   macro avg       0.79      0.78      0.79      1523
weighted avg       0.80      0.80      0.79      1523

0.7957977675640184


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
# accuracy_score is called at the end of this cell; importing it here means the
# cell no longer depends on some other cell having been executed first.
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Load the CSV file
data = pd.read_csv("train.csv")

# Select the feature column ("text") and the label column ("target")
X = data["text"]
y = data["target"]

# Split the data into training and testing sets (80/20, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization, stopword removal, and stemming.
# NOTE(review): this cell does not download any NLTK resources; it presumably
# relies on the tokenizer models and the stopword corpus already being installed.
stemmer = PorterStemmer()
# Bound to a separate name so the imported `stopwords` corpus object is not
# overwritten by a plain set (which would break re-running this statement).
stop_words = set(stopwords.words("english"))

# Drop tokens whose lowercase form is a stopword, stem the rest, and re-join
# with single spaces so the vectorizer receives one string per document.
X_train_processed = X_train.apply(
    lambda x: " ".join([stemmer.stem(word) for word in word_tokenize(x) if word.lower() not in stop_words])
)
X_test_processed = X_test.apply(
    lambda x: " ".join([stemmer.stem(word) for word in word_tokenize(x) if word.lower() not in stop_words])
)

# Vectorize the processed text data: fit TF-IDF on the training split only,
# then apply the same fitted transform to the test split
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train_processed)
X_test_vectorized = vectorizer.transform(X_test_processed)

# Train the classification model (Naive Bayes)
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

# Make predictions on the test set
predictions = model.predict(X_test_vectorized)

# Evaluate the model: per-class report, then overall accuracy
print(classification_report(y_test, predictions))
print(accuracy_score(y_test,predictions))


              precision    recall  f1-score   support

           0       0.79      0.90      0.84       874
           1       0.84      0.67      0.74       649

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.79      1523
weighted avg       0.81      0.80      0.80      1523

0.8036769533814839


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
# accuracy_score is called at the end of this cell; importing it here means the
# cell no longer depends on some other cell having been executed first.
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Load the CSV file
data = pd.read_csv("train.csv")

# Select the feature column ("text") and the label column ("target")
X = data["text"]
y = data["target"]

# Split the data into training and testing sets (80/20, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization, stopword removal, and stemming.
# NOTE(review): this cell does not download any NLTK resources; it presumably
# relies on the tokenizer models and the stopword corpus already being installed.
stemmer = PorterStemmer()
# Bound to a separate name so the imported `stopwords` corpus object is not
# overwritten by a plain set (which would break re-running this statement).
stop_words = set(stopwords.words("english"))

# Drop tokens whose lowercase form is a stopword, stem the rest, and re-join
# with single spaces so the vectorizer receives one string per document.
X_train_processed = X_train.apply(
    lambda x: " ".join([stemmer.stem(word) for word in word_tokenize(x) if word.lower() not in stop_words])
)
X_test_processed = X_test.apply(
    lambda x: " ".join([stemmer.stem(word) for word in word_tokenize(x) if word.lower() not in stop_words])
)

# Vectorize the processed text data: fit TF-IDF on the training split only,
# then apply the same fitted transform to the test split
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train_processed)
X_test_vectorized = vectorizer.transform(X_test_processed)

# Train the classification model (Support Vector Machines), using SVC's default settings
model = SVC()
model.fit(X_train_vectorized, y_train)

# Make predictions on the test set
predictions = model.predict(X_test_vectorized)

# Evaluate the model: per-class report, then overall accuracy
print(classification_report(y_test, predictions))
print(accuracy_score(y_test,predictions))

              precision    recall  f1-score   support

           0       0.79      0.88      0.83       874
           1       0.81      0.69      0.75       649

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523

0.7997373604727511


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# accuracy_score is called at the end of this cell; importing it here means the
# cell no longer depends on some other cell having been executed first.
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Load the CSV file
data = pd.read_csv("train.csv")

# Select the feature column ("text") and the label column ("target")
X = data["text"]
y = data["target"]

# Split the data into training and testing sets (80/20, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization, stopword removal, and stemming.
# NOTE(review): this cell does not download any NLTK resources; it presumably
# relies on the tokenizer models and the stopword corpus already being installed.
stemmer = PorterStemmer()
# Bound to a separate name so the imported `stopwords` corpus object is not
# overwritten by a plain set (which would break re-running this statement).
stop_words = set(stopwords.words("english"))

# Drop tokens whose lowercase form is a stopword, stem the rest, and re-join
# with single spaces so the vectorizer receives one string per document.
X_train_processed = X_train.apply(
    lambda x: " ".join([stemmer.stem(word) for word in word_tokenize(x) if word.lower() not in stop_words])
)
X_test_processed = X_test.apply(
    lambda x: " ".join([stemmer.stem(word) for word in word_tokenize(x) if word.lower() not in stop_words])
)

# Vectorize the processed text data: fit TF-IDF on the training split only,
# then apply the same fitted transform to the test split
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train_processed)
X_test_vectorized = vectorizer.transform(X_test_processed)

# Train the classification model (Random Forest).
# The forest is built with randomness (bootstrap samples / feature subsets), so
# it is seeded with the same value as the split to make the metrics repeatable.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_vectorized, y_train)

# Make predictions on the test set
predictions = model.predict(X_test_vectorized)

# Evaluate the model: per-class report, then overall accuracy
print(classification_report(y_test, predictions))
print(accuracy_score(y_test,predictions))


              precision    recall  f1-score   support

           0       0.78      0.89      0.83       874
           1       0.82      0.65      0.73       649

    accuracy                           0.79      1523
   macro avg       0.80      0.77      0.78      1523
weighted avg       0.79      0.79      0.79      1523

0.7912015758371634
