<a href="https://colab.research.google.com/github/sammainahkinya1404/Programming-Reboot/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Common NLP Tasks:
# Tokenization: Breaking text into words or sentences.
# Stopword Removal: Removing common words (e.g., "the", "is") that do not add significant meaning.
# Lemmatization/Stemming: Reducing words to their base or root form.

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

text = "Natural Language Processing (NLP) is a fascinating field of study."

# Tokenization
tokens = word_tokenize(text)
print("Tokens:", tokens)

# Stopword Removal
filtered_tokens = [word for word in tokens if word.lower() not in stopwords.words('english')]
print("Filtered Tokens:", filtered_tokens)

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("Lemmatized Tokens:", lemmatized_tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'study', '.']
Filtered Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'fascinating', 'field', 'study', '.']
Lemmatized Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'fascinating', 'field', 'study', '.']


In [2]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Sample text data
sentences = [
    "Natural language processing is a subfield of artificial intelligence.",
    "Word embeddings are a type of word representation.",
    "Text classification is a common NLP task."
]

# Tokenize sentences
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

# Train Word2Vec model
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, sg=1)

# Get vector for a word
vector = model.wv['nlp']
print("Word Vector for 'nlp':", vector)


Word Vector for 'nlp': [ 9.7702928e-03  8.1651136e-03  1.2809718e-03  5.0975787e-03
  1.4081288e-03 -6.4551616e-03 -1.4280510e-03  6.4491653e-03
 -4.6173059e-03 -3.9930656e-03  4.9244044e-03  2.7130984e-03
 -1.8479753e-03 -2.8769434e-03  6.0107317e-03 -5.7167388e-03
 -3.2367026e-03 -6.4878250e-03 -4.2346325e-03 -8.5809948e-03
 -4.4697891e-03 -8.5112294e-03  1.4037776e-03 -8.6181965e-03
 -9.9166557e-03 -8.2016252e-03 -6.7726658e-03  6.6805850e-03
  3.7845564e-03  3.5616636e-04 -2.9579818e-03 -7.4283206e-03
  5.3341867e-04  4.9989222e-04  1.9561886e-04  8.5259555e-04
  7.8633073e-04 -6.8160298e-05 -8.0070542e-03 -5.8702733e-03
 -8.3829118e-03 -1.3120425e-03  1.8206370e-03  7.4171280e-03
 -1.9634271e-03 -2.3252917e-03  9.4871549e-03  7.9704521e-05
 -2.4045217e-03  8.6048469e-03  2.6870037e-03 -5.3439722e-03
  6.5881060e-03  4.5101536e-03 -7.0544672e-03 -3.2317400e-04
  8.3448651e-04  5.7473574e-03 -1.7176545e-03 -2.8065301e-03
  1.7484308e-03  8.4717153e-04  1.1928272e-03 -2.6342822e-03
 

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# Sample input and output sequences
input_texts = ["hello", "how are you?"]
output_texts = ["hola", "¿cómo estás?"]

# Define model
model = Sequential()
model.add(Embedding(input_dim=1000, output_dim=64, input_length=10))
model.add(LSTM(128))
model.add(Dense(1000, activation='softmax'))

# Compile model
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Summary of the model
model.summary()




In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Sample text data
texts = ["I love programming", "Python is great", "I dislike bugs", "Debugging is fun"]
labels = [1, 1, 0, 1]  # 1 for positive sentiment, 0 for negative

# Convert texts to TF-IDF features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.25)

# Train classifier
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 1.0


In [6]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense

# Define model
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Compile model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Summary of the model
model.summary()


In [8]:
import warnings
warnings.filterwarnings("ignore")

In [10]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Tokenize input
inputs = tokenizer(["I love NLP", "NLP is challenging"], return_tensors="pt", padding=True, truncation=True)

# Define training arguments and train model
training_args = TrainingArguments(output_dir='./results', num_train_epochs=1, per_device_train_batch_size=8)
trainer = Trainer(model=model, args=training_args, train_dataset=inputs)
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyError: 'Invalid key. Only three types of key are available: (1) string, (2) integers for backend Encoding, and (3) slices for data subsetting.'

In [11]:
from sklearn.metrics import classification_report, accuracy_score

# Example true and predicted labels
true_labels = [1, 0, 1, 1, 0, 1]
predicted_labels = [1, 0, 0, 1, 0, 1]

# Calculate accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print("Accuracy:", accuracy)

# Detailed classification report
report = classification_report(true_labels, predicted_labels)
print("Classification Report:\n", report)


Accuracy: 0.8333333333333334
Classification Report:
               precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       1.00      0.75      0.86         4

    accuracy                           0.83         6
   macro avg       0.83      0.88      0.83         6
weighted avg       0.89      0.83      0.84         6

