In [16]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer

# Read the training set; lines=True parses the file as one JSON record per line
training_path = 'dataset/training.json'
train_data = pd.read_json(training_path, lines=True)

# Preview the first few rows as the cell's output
train_data.head()

Unnamed: 0,city,section,heading
0,chicago,for-sale,Madden NFL 25 XBOX 360. Brand New!
1,paris.en,housing,looking for room to rent.
2,newyork,for-sale,two DS game
3,seattle,housing,map
4,singapore,services,Good Looking Asian Sensation N aughty Girl ---...


In [13]:
# Collect the distinct values found in each label-like column
unique_cities, unique_categories, unique_sections = (
    train_data[column_name].unique()
    for column_name in ('city', 'category', 'section')
)

# Show what was found for each column
print("Unique cities:", unique_cities)
print("Unique categories:", unique_categories)
print("Unique sections:", unique_sections)

Unique cities: ['newyork' 'seattle' 'chicago' 'london' 'manchester' 'hyderabad' 'mumbai'
 'delhi' 'singapore' 'bangalore' 'paris.en' 'geneva.en' 'zurich.en'
 'frankfurt.en' 'kolkata.en' 'dubai.en']
Unique categories: ['cell-phones' 'appliances' 'photography' 'video-games' 'housing' 'shared'
 'temporary' 'wanted-housing' 'activities' 'artists' 'childcare' 'general'
 'automotive' 'household-services' 'real-estate' 'therapeutic']
Unique sections: ['for-sale' 'housing' 'community' 'services']


In [14]:
# Preprocess text data
def preprocess_text(textdata, lemmatizer=None):
    """Normalize an iterable of raw text entries into cleaned strings.

    Each entry goes through these steps, in order:

    1. lower-casing;
    2. URLs (``http://...``, ``https://...`` or `` www....`` preceded by a
       space) are replaced with the token ``URL``;
    3. ``@mentions`` are replaced with the token ``USER``;
    4. every character outside ``[a-zA-Z0-9]`` becomes a space;
    5. any character repeated three or more times in a row is collapsed to
       two occurrences (this applies to digits as well as letters);
    6. words of length 1 are dropped and the remaining words are lemmatized.

    Parameters
    ----------
    textdata : iterable
        Raw text entries. ``None`` and NaN floats are treated as empty
        text; any other non-string value is converted with ``str()``.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word) -> str``. When omitted a new
        ``WordNetLemmatizer`` is created.

    Returns
    -------
    list of str
        One cleaned string per input entry, in input order. Every kept word
        is followed by a single space, so non-empty results end with a
        trailing space.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # Compile once per call. Raw strings throughout: the previous non-raw
    # '@[^\s]+' relied on an invalid string escape.
    url_re = re.compile(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)")
    user_re = re.compile(r"@[^\s]+")
    non_alnum_re = re.compile(r"[^a-zA-Z0-9]")
    repeat_re = re.compile(r"(.)\1\1+")

    processed = []
    for text in textdata:
        # Missing values would otherwise raise AttributeError on .lower().
        if text is None or (isinstance(text, float) and text != text):
            text = ''
        elif not isinstance(text, str):
            text = str(text)

        text = text.lower()
        # The placeholder tokens carry a leading space so they never fuse
        # with the preceding word.
        text = url_re.sub(' URL', text)
        text = user_re.sub(' USER', text)
        text = non_alnum_re.sub(' ', text)
        text = repeat_re.sub(r"\1\1", text)

        # Single-character words are dropped before lemmatizing.
        kept_words = [
            lemmatizer.lemmatize(word)
            for word in text.split()
            if len(word) > 1
        ]
        # Keep the historical "word + space" layout so the output matches
        # what earlier runs of this notebook produced.
        processed.append(''.join(word + ' ' for word in kept_words))

    return processed

# Clean every heading and write the result back to the same column
lowercased_headings = train_data['heading'].str.lower()
train_data['heading'] = preprocess_text(lowercased_headings)


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import joblib


# One seed shared by the split and the classifier, so a fresh
# Restart & Run All rebuilds the same model.
RANDOM_SEED = 42

# Split the data into training and testing sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(
    train_data['heading'],
    train_data['category'],
    test_size=0.2,
    random_state=RANDOM_SEED,
)

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit on the training split only; the test split is transformed later with
# this same fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)

# Initialize LinearSVC classifier. random_state is set explicitly because the
# solver may shuffle the data, which would make the fit vary between runs.
classifier = LinearSVC(random_state=RANDOM_SEED)

# Train the classifier
classifier.fit(X_train_tfidf, y_train)

# Save the trained model and its vectorizer; both files are needed together
# to classify new headings.
joblib.dump(classifier, 'text_classifier_model.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')




['tfidf_vectorizer.joblib']

In [10]:
from sklearn.metrics import accuracy_score, classification_report

# Vectorize the held-out headings with the already-fitted TF-IDF vectorizer,
# then let the trained classifier label them
X_test_tfidf = vectorizer.transform(X_test)
y_pred = classifier.predict(X_test_tfidf)

# Overall accuracy on the held-out split
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

# Per-category precision / recall / F1 table, printed under its header line
print('Classification Report:', classification_report(y_test, y_pred), sep='\n')


Accuracy: 0.7559347181008902
Classification Report:
                    precision    recall  f1-score   support

        activities       0.83      0.49      0.62        79
        appliances       0.90      0.72      0.80       265
           artists       0.76      0.54      0.63        89
        automotive       0.88      0.77      0.82       233
       cell-phones       0.97      0.91      0.94       365
         childcare       0.88      0.72      0.80       239
           general       0.58      0.44      0.50       197
household-services       0.79      0.77      0.78       275
           housing       0.67      0.44      0.53        55
       photography       0.89      0.85      0.87       294
       real-estate       0.77      0.67      0.72       240
            shared       0.42      0.85      0.57       439
         temporary       0.73      0.47      0.57       317
       therapeutic       0.93      0.97      0.95       447
       video-games       0.95      0.82      0.