In [1]:
import tensorflow as tf 
import pandas as pd 
import numpy as np 
from transformers import pipeline
from transformers import AutoTokenizer , TFAutoModelForSequenceClassification
import boto3 
from sklearn.preprocessing import LabelEncoder
import lxml 
import os
import re 
import string
from dotenv import load_dotenv, find_dotenv




In [None]:
# Report the versions of the key libraries so a run can be reproduced later.
# The labels keep their original padding so the printed lines are unchanged.
for label, module in (
    ("boto3 ", boto3),
    ("pandas", pd),
    ("numpy ", np),
    ("tensorflow ", tf),
):
    print(f"{label}: {module.__version__}")

boto3 : 1.40.54
pandas: 2.3.3
numpy : 1.26.4
tensorflow : 2.15.1


In [None]:


# Locate the .env file and load its entries into the process environment.
path = find_dotenv()
load_dotenv(path)

# AWS settings read from the environment; each is None when the variable is unset.
KEY_ID, ACCESS_KEY, REGION = (
    os.getenv(name) for name in ("KEY_ID", "ACCESS_KEY", "REGION")
)

# High-level S3 resource handle used by the cells below.
s3_object = boto3.resource(
    "s3",
    region_name=REGION,
    aws_access_key_id=KEY_ID,
    aws_secret_access_key=ACCESS_KEY,
)



In [4]:
# One-off setup cell: create the S3 bucket and upload the local CSV into it.
# Leave `exicute` as False on normal runs; set it to True only when the
# bucket/object still have to be created.
exicute = False 

nw_bckt_nme = "imdb-dataset-11.8" # new bucket name 
# NOTE(review): this is a raw string, so every `\\` is two literal backslashes.
# Left unchanged here; confirm the path resolves on the target machine.
file_path = r"M:\\MACHINE LEARNING\\datasets\\IMDB Dataset.csv" # local file path 
s3_file_name = "IMDB Dataset.csv" # this will be remote file name 

if exicute: 

    if REGION == "us-east-1":
        # S3 rejects an explicit LocationConstraint of "us-east-1"; that
        # region is selected by omitting CreateBucketConfiguration.
        s3_object.create_bucket(Bucket = nw_bckt_nme)
    else:
        s3_object.create_bucket(
            Bucket = nw_bckt_nme, # unique bucket name 
            CreateBucketConfiguration = {"LocationConstraint": REGION} # fixed the region 
        )

    # Upload the local file under the remote key defined above.
    s3_object.Bucket(nw_bckt_nme).upload_file(file_path,s3_file_name)

else : 
    print("dont exicute")

dont exicute


In [5]:
# Re-declare the bucket/key so this cell runs without the setup cell above.
s3_file_name = "IMDB Dataset.csv"
nw_bckt_nme = "imdb-dataset-11.8"

# Fetch the object and stream its body straight into pandas.
bucket = s3_object.Bucket(nw_bckt_nme)
file_obj = bucket.Object(s3_file_name).get()
df = pd.read_csv(file_obj["Body"])


In [6]:
# Data-quality check: missing values per column and count of fully duplicated rows.
null_counts = df.isna().sum()
duplicate_count = df.duplicated().sum()

print(null_counts)
print(f"dups: {duplicate_count}")


review       0
sentiment    0
dtype: int64
dups: 418


In [7]:
# Remove fully duplicated rows, then confirm none remain.
df = df.drop_duplicates()
remaining_dups = df.duplicated().sum()
print(f"dups: {remaining_dups}")

dups: 0


In [8]:
# Learn the label -> integer mapping from the sentiment column, then apply it.
# `encoder` stays available so predictions can be mapped back to labels.
encoder = LabelEncoder().fit(df['sentiment'])
df['sentiment'] = encoder.transform(df['sentiment'])


In [9]:
# Preview the first five rows after de-duplication and label encoding.
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [10]:

from lxml import html


# Helpers for clean_text, built once instead of on every call.
_BR_TAG_PATTERN = re.compile(r'<br\s*/?>', flags=re.IGNORECASE)
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')  # '.' after www is escaped on purpose
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text (text): 
    """Return a normalized copy of a raw review string.

    Steps, in order:
      1. replace ``<br>`` / ``<br />`` tags with a space so the words on either
         side stay separated,
      2. strip the remaining HTML markup with ``lxml.html``,
      3. lowercase,
      4. replace ``http(s)://...`` and ``www....`` tokens with a space,
      5. delete every character in ``string.punctuation``.

    Python strings are immutable, so each step's result is assigned back to
    ``text``; calling the method without using its return value has no effect.
    HTML is parsed before punctuation is removed because deleting ``<``, ``>``
    and ``/`` first would turn the tags into ordinary words.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags and URLs.

    Returns
    -------
    str
        The cleaned text; ``""`` when the input is empty or whitespace only.
    """
    text = _BR_TAG_PATTERN.sub(" ", text)

    # html.fromstring raises on an empty document, so short-circuit here.
    if not text.strip():
        return ""

    text = html.fromstring(text).text_content()
    text = text.lower()
    text = _URL_PATTERN.sub(" ", text)
    text = text.translate(_PUNCTUATION_TABLE)
    return text


# Store the cleaned text in a new 'reviews' column; the raw 'review' column is kept.
df['reviews'] = df['review'].map(clean_text)




In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
y = df['sentiment']
split_parts = train_test_split(
    df['reviews'],
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [12]:

# Checkpoint name used for the tokenizer in this cell.
trnsformr = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=trnsformr,
)

In [13]:
def _tokenize_reviews(reviews):
    """Convert a pandas Series of reviews to a list of str and tokenize it.

    The tokenizer is given a plain list of strings rather than the Series.
    Returns the string list together with the tokenizer output, which is
    truncated and padded.
    """
    texts = reviews.astype(str).tolist()
    encoded = tokenizer(texts, truncation=True, padding=True)
    return texts, encoded


train_string_data, train_tokens = _tokenize_reviews(X_train)
test_string_data, test_tokens = _tokenize_reviews(X_test)


In [14]:
# convert token into tensor for training 



# Pair each split's tokenizer output (as a plain dict) with its labels.
train_slices = tf.data.Dataset.from_tensor_slices(
    (dict(train_tokens), y_train.values)
)
test_slices = tf.data.Dataset.from_tensor_slices(
    (dict(test_tokens), y_test.values)
)

# Only the training stream is shuffled; both are batched in groups of 16.
train_data = train_slices.shuffle(1000).batch(16)
test_data = test_slices.batch(16)


In [15]:
# fine tuning transformer for sentiment analysis. 
from transformers import TFAutoModelForSequenceClassification

trnsformr = "bert-base-uncased"  # checkpoint to fine-tune
output_label = 2  # two classes: positive and negative

# from_pt=True loads the PyTorch checkpoint and converts its weights for TensorFlow.
model = TFAutoModelForSequenceClassification.from_pretrained(
    trnsformr,
    num_labels=output_label,
    from_pt=True,
)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 76dbce33-82dc-4e2c-a72f-f6676d9717be)')' thrown while requesting HEAD https://huggingface.co/bert-base-uncased/resolve/main/config.json
Retrying in 1s [Retry 1/5].





TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


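Note: the `Trainer` / `TrainingArguments` API from `transformers` only works with PyTorch models. This notebook uses the TensorFlow model class, so the cell below is kept commented out and the model is compiled and trained with Keras instead.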

In [16]:
# from transformers import Trainer, TrainingArguments

# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     logging_dir="./logs",
#     learning_rate=2e-5,
#     per_device_train_batch_size=8,
#     num_train_epochs=3,
#     weight_decay=0.01,
# )
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_data,
#     eval_dataset=test_data
#     )

# trainer.train()

In [18]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# Training configuration: Adam with a learning rate of 2e-5, and a loss that
# takes raw logits (from_logits=True) with integer class labels.
optimizer = Adam(learning_rate=2e-5)
loss = SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)


In [None]:
# Fine-tune before saving. With the fit call commented out, a fresh
# Restart & Run All would save a checkpoint whose classification head was
# never trained.
history = model.fit(
    train_data,
    validation_data=test_data,
    epochs=1
)

# Persist the fine-tuned weights and config to ./transformer_model.
model.save_pretrained("transformer_model")


In [None]:
# text = ["I love this movie!", "It was terrible."]
# inputs = tokenizer(text, return_tensors="tf", padding=True, truncation=True)
# preds = model(inputs)
# probs = tf.nn.softmax(preds.logits, axis=-1)
# print(probs.numpy())
