In [2]:
# Importing necessary libraries from the NLTK toolkit
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize  # For tokenizing text into words and sentences

# Importing stopwords from NLTK to remove common words that add little value
from nltk.corpus import stopwords

# Downloading required NLTK datasets
nltk.download('punkt')  # Tokenizer models for sentence and word tokenization
nltk.download('punkt_tab')  # Optional: Extra support for tokenization
nltk.download('stopwords')  # Predefined stopword lists for various languages

! pip install pandas
# Importing pandas for working with data in tabular format 
import pandas as pd
# Loading the CSV dataset into a DataFrame
df = pd.read_csv("data.csv")  # CSV contains columns like 'Message' and labels indicating spam or not

# Step to clean the text data:
# - Removing punctuation, special characters, and multiple spaces
# - Preparing data for tokenization and further text processing

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [2]:
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


In [3]:
import nltk
nltk.download('stopwords')

import re  # Regular expressions for text cleaning

cleaned = []  # List to store cleaned text
for text in df['Sentence']:  # Looping through each message in the 'Message' column
    cleaned_text = re.sub(r'[^\w\s]', '', text)  # Removing all characters except words and spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replacing multiple spaces with a single space
    cleaned_data = cleaned_text.strip()  # Stripping leading and trailing whitespace
    cleaned.append(cleaned_data)  # Adding the cleaned text to the list

# Tokenizing the cleaned text into words
# This step splits each cleaned text into a list of words
tokens = [word_tokenize(x) for x in cleaned]

# Removing stopwords from tokenized words
# Stopwords are commonly used words like "is", "the", "and", etc., which are removed to reduce noise
stop = set(stopwords.words('english'))  # Fetching the list of English stopwords
stop_token = []  # List to store stopword-removed tokens
for k in range(len(df['Sentence'])):  # Loop through the tokenized text
    p = [i for i in tokens[k] if i not in stop]  # Filter out tokens that are in the stopword list
    stop_token.append(p)  # Append the filtered tokens to the list

# Summary of steps:
# 1. Dataset is downloaded and loaded into a pandas DataFrame.
# 2. Text messages are cleaned by removing punctuation, special characters, and extra spaces.
# 3. The cleaned text is tokenized into words.
# 4. Stopwords are removed to focus on meaningful words.

[nltk_data] Downloading package stopwords to C:\Users\VIGNESH
[nltk_data]     VARMA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
tokens

[['The',
  'GeoSolutions',
  'technology',
  'will',
  'leverage',
  'Benefon',
  's',
  'GPS',
  'solutions',
  'by',
  'providing',
  'Location',
  'Based',
  'Search',
  'Technology',
  'a',
  'Communities',
  'Platform',
  'location',
  'relevant',
  'multimedia',
  'content',
  'and',
  'a',
  'new',
  'and',
  'powerful',
  'commercial',
  'model'],
 ['ESI',
  'on',
  'lows',
  'down',
  '150',
  'to',
  '250',
  'BK',
  'a',
  'real',
  'possibility'],
 ['For',
  'the',
  'last',
  'quarter',
  'of',
  '2010',
  'Componenta',
  's',
  'net',
  'sales',
  'doubled',
  'to',
  'EUR131m',
  'from',
  'EUR76m',
  'for',
  'the',
  'same',
  'period',
  'a',
  'year',
  'earlier',
  'while',
  'it',
  'moved',
  'to',
  'a',
  'zero',
  'pretax',
  'profit',
  'from',
  'a',
  'pretax',
  'loss',
  'of',
  'EUR7m'],
 ['According',
  'to',
  'the',
  'FinnishRussian',
  'Chamber',
  'of',
  'Commerce',
  'all',
  'the',
  'major',
  'construction',
  'companies',
  'of',
  'Finland',


In [5]:
tokens

[['The',
  'GeoSolutions',
  'technology',
  'will',
  'leverage',
  'Benefon',
  's',
  'GPS',
  'solutions',
  'by',
  'providing',
  'Location',
  'Based',
  'Search',
  'Technology',
  'a',
  'Communities',
  'Platform',
  'location',
  'relevant',
  'multimedia',
  'content',
  'and',
  'a',
  'new',
  'and',
  'powerful',
  'commercial',
  'model'],
 ['ESI',
  'on',
  'lows',
  'down',
  '150',
  'to',
  '250',
  'BK',
  'a',
  'real',
  'possibility'],
 ['For',
  'the',
  'last',
  'quarter',
  'of',
  '2010',
  'Componenta',
  's',
  'net',
  'sales',
  'doubled',
  'to',
  'EUR131m',
  'from',
  'EUR76m',
  'for',
  'the',
  'same',
  'period',
  'a',
  'year',
  'earlier',
  'while',
  'it',
  'moved',
  'to',
  'a',
  'zero',
  'pretax',
  'profit',
  'from',
  'a',
  'pretax',
  'loss',
  'of',
  'EUR7m'],
 ['According',
  'to',
  'the',
  'FinnishRussian',
  'Chamber',
  'of',
  'Commerce',
  'all',
  'the',
  'major',
  'construction',
  'companies',
  'of',
  'Finland',


In [6]:

from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\VIGNESH
[nltk_data]     VARMA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
stem_vec = [' '.join(text) for text in stemedata]

NameError: name 'stemedata' is not defined

In [None]:
stem_vec

['the geosolut technolog leverag benefon gp solut provid locat base search technolog commun platform locat relev multimedia content new power commerci model',
 'esi low 150 250 bk real possibl',
 'for last quarter 2010 componenta net sale doubl eur131m eur76m period year earlier move zero pretax profit pretax loss eur7m',
 'accord finnishrussian chamber commerc major construct compani finland oper russia',
 'the swedish buyout firm sold remain 224 percent stake almost eighteen month take compani public finland',
 'spi wouldnt surpris see green close',
 'shell 70 billion bg deal meet sharehold skeptic',
 'ssh commun secur corp stock exchang releas octob 14 2008 at 245 pm the compani updat full year outlook estim result remain loss full year',
 'kone net sale rose 14 yearonyear first nine month 2008',
 'the stockmann depart store total floor space 8000 squar metr stockmann invest project price tag eur 12 million',
 'circul revenu increas 5 finland 4 sweden 2008',
 'sap q1 disappoint soft

In [None]:
x=cv.fit_transform(stem_vec).toarray()

In [None]:
#importing multinomial
from sklearn.naive_bayes import MultinomialNB

In [None]:
y=df['Sentiment']

In [None]:
mb = MultinomialNB()

In [None]:
mb.fit(x,y)

In [None]:
x[0]

array([1, 0, 0])

In [None]:
df['Sentence'][0]

'positive'

In [None]:
mb.predict([x[1]])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test = train_test_split(x_vec,y,test_size=0.25)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lgr = LogisticRegression()

In [None]:
lgr.fit(X_train,y_train)

In [None]:
import pickle
with open("vectorizer.pickle","wb") as mdl_file:
    pickle.dump(x,mdl_file)