In [None]:
import pandas as pd
import numpy as np
import warnings
# Suppress every warning for the rest of the session. This runs before the
# nltk/spacy imports below, so warnings emitted while importing them are hidden too.
# NOTE(review): this also hides deprecation warnings from pandas/sklearn later on.
warnings.filterwarnings("ignore")
import nltk
import spacy
# Fetch the NLTK data packages this notebook relies on.
nltk.download('stopwords')  # word list read through stopwords.words('english')
nltk.download('punkt')      # tokenizer models used by word_tokenize on older NLTK releases
# Newer NLTK releases load the 'punkt_tab' resource in word_tokenize instead of
# 'punkt'; without it tokenization raises LookupError on a fresh environment.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
# To install a missing dependency, run in a shell: python -m pip install package_name
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Read the training and test tables from CSV files in the working directory.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
test_data = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
import matplotlib.pyplot as plt

# Tally how many training rows carry each value of the `target` column
counts = train_data['target'].value_counts()
print(counts)

# Draw the tallies as a bar chart on an explicitly created 6x6-inch figure
fig, ax = plt.subplots(figsize=(6, 6))
counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Binary Output')
ax.set_xlabel('Categories')
ax.set_ylabel('Counts')
plt.setp(ax.get_xticklabels(), rotation=0)  # keep the category labels horizontal
plt.show()

In [None]:
from twitter_clean import clean_text, text_mislabels

# Derive a `clean_text` column in both frames by running the project's
# clean_text helper over every value of the `text` column.
for frame in (train_data, test_data):
    frame['clean_text'] = frame['text'].apply(clean_text)

# Persist both frames (without the index) once cleaning has finished for both.
for frame, csv_path in ((train_data, 'train_clean.csv'), (test_data, 'test_clean.csv')):
    frame.to_csv(csv_path, index=False)

In [None]:
# Module-level resources consumed by preprocessing():
# NLTK's English stop-word list held as a set, and spaCy's small English pipeline.
stop_words = {word for word in stopwords.words('english')}
nlp = spacy.load("en_core_web_sm")
def preprocessing(text):
    """Lower-case, tokenize, drop stop words from, and lemmatize one document.

    Relies on the module-level `stop_words` collection, the spaCy pipeline
    `nlp`, and NLTK's `word_tokenize`.

    Args:
        text: The document to normalize. ``None`` and float NaN (what pandas
            yields for empty cells) are treated as an empty document; any
            other non-string value is converted with ``str()``.

    Returns:
        str: The lemmas of the surviving tokens joined by single spaces, or
        ``""`` when there is nothing to process.
    """
    # Missing values would otherwise crash on `.lower()`; NaN is the only
    # float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    normalized = text.lower().strip()
    if not normalized:
        # Blank input produces an empty result either way; skip the tokenizer
        # and the spaCy call.
        return ""

    kept_tokens = [token for token in word_tokenize(normalized) if token not in stop_words]
    # spaCy lemmatizes the stop-word-free text as a single document.
    doc = nlp(" ".join(kept_tokens))
    return " ".join(token.lemma_ for token in doc)
# Run preprocessing() over the `clean_text` column of both frames and store
# the result in a new `preprocessed_text` column.
for frame in (train_data, test_data):
    frame['preprocessed_text'] = frame['clean_text'].apply(preprocessing)

# Write both frames (without the index) after preprocessing has finished for both.
for frame, csv_path in ((train_data, 'train_preprocessed.csv'), (test_data, 'test_preprocessed.csv')):
    frame.to_csv(csv_path, index=False)

In [None]:
# Pass the training frame through the project's text_mislabels helper, naming
# 'text' and 'target' as the columns it should work with, and save what it
# returns. Later cells read a `new_target` column from this result.
new_train = text_mislabels(train_data, 'text', 'target')
new_train.to_csv('new_train.csv', index=False)

In [None]:
import matplotlib.pyplot as plt

# Tally how many rows of new_train carry each value of the `new_target` column
counts = new_train['new_target'].value_counts()
print(counts)

# Draw the tallies as a bar chart on an explicitly created 6x6-inch figure
fig, ax = plt.subplots(figsize=(6, 6))
counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Binary Output')
ax.set_xlabel('Categories')
ax.set_ylabel('Counts')
plt.setp(ax.get_xticklabels(), rotation=0)  # keep the category labels horizontal
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of new_train for validation. The split is stratified on
# `new_target` and seeded so it is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    new_train['preprocessed_text'],
    new_train['new_target'],
    test_size=0.2,
    stratify=new_train['new_target'],
    random_state=11,
)
x_train

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Learn the word-level TF-IDF vocabulary from the training texts only, then
# project the held-out texts onto that same vocabulary.
tfidf_vect = TfidfVectorizer(analyzer='word')
X_train_tfidf = tfidf_vect.fit_transform(x_train)
X_valid_tfidf = tfidf_vect.transform(x_test)

# fit() returns the estimator itself, so construction and training can be chained.
clf = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_valid_tfidf)

In [None]:
from sklearn.metrics import classification_report

# Text summary of the per-class metrics for the held-out predictions
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

In [None]:
from sklearn.metrics import f1_score, accuracy_score

# F1 of the held-out predictions (f1_score's default averaging)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print("F1 Score:", f1)

# Share of held-out rows predicted correctly
a_s = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", a_s)

In [None]:
# Vectorize the test texts with the already-fitted TF-IDF vocabulary and classify them.
X_test_tfidf = tfidf_vect.transform(test_data['preprocessed_text'])
y_pred_final = clf.predict(X_test_tfidf)

# Place the predictions into the sample-submission table by position.
# NOTE(review): this assumes sample_submission.csv lists rows in the same order as test.csv.
tem = pd.read_csv('sample_submission.csv')
tem['target'] = y_pred_final
# Force the column to plain 0/1 integers: anything not equal to 1 becomes 0.
tem['target'] = tem['target'].map(lambda label: 1 if label == 1 else 0)
tem.to_csv('test_prediction.csv', index=False)

In [None]:
import re

# A '#' that is NOT directly preceded by a word character (\B), followed by a
# run of word characters containing at least one ASCII letter. So "#python" and
# "#1a2" match, while "#123" (no letter) and "abc#def" (glued to a word) do not.
_HASHTAG_RE = re.compile(r'\B#\w*[a-zA-Z]+\w*')


def remove_hashtags(tweet):
    """Return `tweet` with every matching hashtag deleted.

    Only the hashtag text itself is removed; the whitespace that surrounded
    it is left in place, so the result may contain doubled or trailing spaces.

    Args:
        tweet: The text to strip hashtags from.

    Returns:
        The text with all matches of the hashtag pattern replaced by ''.
    """
    return _HASHTAG_RE.sub('', tweet)

# Demo: strip the hashtags from a sample tweet and print it before and after.
tweet = "Check out this cool #python library I found! #coding #programming"
cleaned_tweet = remove_hashtags(tweet)
for caption, shown in (("Original Tweet:", tweet), ("Cleaned Tweet:", cleaned_tweet)):
    print(caption, shown)


In [None]:
# Print the vocabulary learned by the fitted TF-IDF vectorizer, one term per line.
print(*tfidf_vect.get_feature_names_out(), sep="\n")