In [19]:
import csv

def parse_csv(file_path):
    tweets = []
    classes = []
    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        next(reader)  # Skip the header
        for row in reader:
            tweet, class_ = ','.join(row[:-1]), row[-1]
            tweets.append(tweet)
            classes.append(class_)
    return tweets, classes

train_tweets, train_classes = parse_csv('train.csv')
test_tweets, test_classes = parse_csv('test.csv')

# print (train_tweets[0])
# print (train_classes[0])

In [20]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andi\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [30]:
# Initialize stemmer, lemmatizer, and stop words list
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize the text
    words = nltk.word_tokenize(text)
    
    # Remove stop words and stem and lemmatize the words
    words = [lemmatizer.lemmatize(stemmer.stem(word)) for word in words if word not in stop_words]
    
    return ' '.join(words)

# Apply the preprocessing to each tweet
train_tweets = [preprocess_text(tweet) for tweet in train_tweets]
test_tweets = [preprocess_text(tweet) for tweet in test_tweets]

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the transform
vectorizer = TfidfVectorizer(max_df=0.50, min_df=5)

# Tokenize and build vocab
vectorizer.fit(train_tweets)

# Encode document
train_vectors = vectorizer.transform(train_tweets)
test_vectors = vectorizer.transform(test_tweets)

In [12]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Define the parameter grid
param_grid = {
    'C': [0.1, 1.0],
    'loss': ['hinge', 'squared_hinge'],
    'multi_class': ['ovr', 'crammer_singer'],
    'penalty': ['l1', 'l2'],
}

# Create a base model
svc = LinearSVC(dual=False, max_iter=3000)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(train_vectors, train_classes)

# Get the best parameters
best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')


Fitting 3 folds for each of 16 candidates, totalling 48 fits


12 fits failed out of a total of 48.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\andi\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\andi\anaconda3\Lib\site-packages\sklearn\svm\_classes.py", line 274, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
                                           ^^^^^^^^^^^^^^^
  File "c:\Users\andi\anaconda3\Lib\site-packages\sklearn\svm\_base.py", line 1223, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
                  ^^^^^^^^

Best parameters: {'C': 0.1, 'loss': 'squared_hinge', 'multi_class': 'ovr', 'penalty': 'l1'}


In [13]:
# Train a LinearSVC model with the best parameters
clf = LinearSVC(**best_params, dual=False)
clf.fit(train_vectors, train_classes)

# Predict the classes of the test vectors
test_predictions = clf.predict(test_vectors)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(test_classes, test_predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7464320866141733


In [32]:
# Train a LinearSVC model with the best parameters
clf = LinearSVC(C=0.1, loss='squared_hinge', multi_class='ovr', penalty= 'l1', dual=False)
clf.fit(train_vectors, train_classes)

# Predict the classes of the test vectors
test_predictions = clf.predict(test_vectors)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(test_classes, test_predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7464320866141733


In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Define a pipeline combining a text feature extractor with a simple classifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Define the parameter grid
param_grid = {
    'tfidf__max_df': [0.25, 0.5, 0.75],
    'tfidf__min_df': [1, 5, 10],
    'clf__C': [0.1],
    'clf__loss': ['hinge', 'squared_hinge'],
    'clf__multi_class': ['ovr'],
    'clf__penalty': ['l1', 'l2'],
    'clf__dual': [True, False],
}

# Instantiate the grid search model
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(train_tweets, train_classes)

# Get the best parameters
best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')

# Predict the classes of the test vectors
test_predictions = grid_search.predict(test_tweets)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(test_classes, test_predictions)
print(f'Accuracy: {accuracy}')

Fitting 3 folds for each of 72 candidates, totalling 216 fits


108 fits failed out of a total of 216.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
27 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\andi\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\andi\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\andi\anaconda3\Lib\site-packages\sklearn\svm\_classes.py", line 274, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
                                           ^^^^^^^^^^^^^^^
  File "c:\Users\andi\anaconda3\Lib\site-packages\skle

Best parameters: {'clf__C': 0.1, 'clf__dual': False, 'clf__loss': 'squared_hinge', 'clf__multi_class': 'ovr', 'clf__penalty': 'l1', 'tfidf__max_df': 0.5, 'tfidf__min_df': 1}
Accuracy: 0.7469242125984252
