In [1]:
print("hi")

hi


In [2]:
import sys
import re
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer


from twitter_sentiment.constants import TARGET_COLUMN, SCHEMA_FILE_PATH, CURRENT_YEAR
from twitter_sentiment.entity.config_entity import DataTransformationConfig
from twitter_sentiment.entity.artifact_entity import DataTransformationArtifact, DataIngestionArtifact, DataValidationArtifact
from twitter_sentiment.exception import TwetterException
from twitter_sentiment.logger import logging
from twitter_sentiment.utils.main_utils import save_object, save_numpy_array_data, read_yaml_file

In [3]:
port_stem = PorterStemmer()

stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd",
 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each',
 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't',
 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't",
 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma',
 'mightn', "mightn't", 'mustn', "mustn't", 'needn', 
 "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

tfidf = TfidfVectorizer()

In [4]:
def clean_text(text):  
    pat1 = r'@[^ ]+'                   #@signs and value
    pat2 = r'https?://[A-Za-z0-9./]+'  #links
    pat3 = r'\'s'                      #floating s's
    pat4 = r'\#\w+'                     # hashtags and value
    pat5 = r'&amp '
    pat6 = r'[^A-Za-z\s]'         #remove non-alphabet
    combined_pat = r'|'.join((pat1, pat2,pat3,pat4,pat5, pat6))
    text = re.sub(combined_pat,"",text).lower()
    text = text.split()
    text = [word for word in text if not word in stop ]
    text = [port_stem.stem(word) for word in text]
    text = " ".join(text)
    return text.strip()

In [None]:
class DataTransformation:
    def __init__(self, data_ingestion_artifact: DataIngestionArtifact,
                 data_transformation_config: DataTransformationConfig,
                 data_validation_artifact: DataValidationArtifact):
        """
        :param data_ingestion_artifact: Output reference of data ingestion artifact stage
        :param data_transformation_config: configuration for data transformation
        """
        try:
            self.data_ingestion_artifact = data_ingestion_artifact
            self.data_transformation_config = data_transformation_config
            self.data_validation_artifact = data_validation_artifact
            self._schema_config = read_yaml_file(file_path=SCHEMA_FILE_PATH)
        except Exception as e:
            raise TwetterException(e, sys)
    
    @staticmethod
    def read_data(file_path) -> pd.DataFrame:
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise TwetterException(e, sys)


In [44]:
train_df = pd.read_csv("D:\\CDAC\\Project\\Twitter_Sentiment_Analysis\\artifact\\12_29_2024_10_43_04\\data_ingestion\\ingested\\train.csv")
test_df = pd.read_csv("D:\\CDAC\\Project\\Twitter_Sentiment_Analysis\\artifact\\12_29_2024_10_43_04\\data_ingestion\\ingested\\test.csv")

In [45]:
train_df.shape, test_df.shape

((96000, 2), (24000, 2))

In [47]:
 train_df[train_df["polarity"] == 0].shape

(48097, 2)

In [48]:
test_df["polarity"].value_counts()

polarity
1.0    12097
0.0    11903
Name: count, dtype: int64

In [49]:
test_df["tweet"] = test_df["tweet"].apply(clean_text)
train_df["tweet"] = train_df["tweet"].apply(clean_text)


In [20]:
train_data_tweet = tfidf.fit_transform(train_df["tweet"])
test_data_tweet = tfidf.transform(test_df["tweet"])

In [50]:
train_data_tweet.shape

(160000, 67480)

In [53]:
import numpy as np
import scipy.sparse as sp

# Filter out any rows with missing values in the 'tweet' or 'polarity' columns
train_df = train_df.dropna(subset=["tweet", "polarity"])

# Perform the TF-IDF transformation again on the filtered dataframe
train_data_tweet = tfidf.fit_transform(train_df["tweet"])

# Get the polarity column as a numpy array
polarity = train_df["polarity"].values

# Ensure the dimensions match
assert train_data_tweet.shape[0] == polarity.shape[0], "Mismatched dimensions"

# Create a sparse matrix of the polarity column
polarity_sparse = sp.csr_matrix(polarity.reshape(-1, 1))

# Concatenate the sparse matrices
combined_data_sparse = sp.hstack([train_data_tweet, polarity_sparse])

# Check the shape of the combined sparse matrix
print(combined_data_sparse.shape)


(96000, 49841)


In [54]:
import numpy as np

# Calculate the size in bytes
size_in_bytes = combined_data_sparse.data.nbytes

# Convert bytes to megabytes
size_in_mb = size_in_bytes / (1024 ** 2)

# Display the size in MB
print(f"Size of combined_data_sparse: {size_in_mb:.2f} MB")


Size of combined_data_sparse: 5.42 MB


In [35]:
train_data_tweet.shape, train_df["polarity"].shape

((160000, 67480), (160000,))

In [39]:
target_feature_train = np.reshape(train_df["polarity"], (-1, 1))

# Concatenate the arrays
train_data_combined = np.concatenate((train_data_tweet, target_feature_train), axis=1)

print("Shape of train_data_combined:", train_data_combined.shape)


ValueError: zero-dimensional arrays cannot be concatenated

In [40]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

In [42]:
train_df["polarity"].iloc[:10000,:].

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["polarity"].iloc[:10000,:] = 1


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [41]:
lr.fit(train_data_tweet,train_df["polarity"])

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: -1.0