In [7]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


In [8]:
# Load the two OpenSubtitles language-pair configurations used in this notebook.
dataset_en_hi = load_dataset(path="open_subtitles", name="en-hi")
dataset_da_ru = load_dataset(path="open_subtitles", name="da-ru")

In [9]:
# Work with the "train" split of each language pair.
data_en_hi, data_da_ru = dataset_en_hi["train"], dataset_da_ru["train"]

In [10]:
# Peek at the first two records: ids, metadata and the en/hi translation pairs.
data_en_hi[:2]

{'id': ['0', '1'],
 'meta': [{'year': 1948,
   'imdbId': 40522,
   'subtitleId': {'en': 4180294, 'hi': 4239106},
   'sentenceIds': {'en': [1], 'hi': [1]}},
  {'year': 1948,
   'imdbId': 40522,
   'subtitleId': {'en': 4180294, 'hi': 4239106},
   'sentenceIds': {'en': [2], 'hi': [2]}}],
 'translation': [{'en': 'THE BICYCLE THIEF', 'hi': 'साइकिल चोर'},
  {'en': 'Ricci?', 'hi': 'रिच्ची?'}]}

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Read the 'translation' column once (instead of once per comprehension) and
# split every pair into its English sentence and its Hindi sentence.
# Both lists keep the dataset order, so they stay index-aligned.
translation_pairs = data_en_hi['translation']
english_subtitles = [pair['en'] for pair in translation_pairs]
hindi_translations = [pair['hi'] for pair in translation_pairs]



In [None]:
# Bag-of-words features: one row per English subtitle.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(english_subtitles)

# NOTE(review): the targets are the raw Hindi sentences, so the classifier
# treats every distinct sentence as its own class - confirm this is intended.
y = hindi_translations

# Hyper-parameter candidates: four values of C, linear kernel only.
param_grid = dict(C=[0.1, 1, 10, 100], kernel=['linear'])

# Exhaustive search over the grid with 3-fold cross-validation.
svm_model = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=3)
svm_model.fit(X, y)

# Report the winning parameter combination.
print("Best Parameters:", svm_model.best_params_)

# To keep the fitted search object for later use, it can be persisted with
# joblib, e.g. dump(svm_model, 'svm_model.joblib').


In [12]:
# Distinct Hindi sentences, i.e. the candidate target classes.
unique_classes = set(hindi_translations)

# Show the count and a small, deterministic sample rather than the whole set:
# printing every distinct sentence floods the cell output.
print("Number of unique classes:", len(unique_classes))
print("Sample of unique classes:", sorted(unique_classes)[:5])




In [11]:
def prepare_labels(data):
  """
  Derive one integer label per translation record from the language keys it holds.

  Args:
    data: Object indexable by 'translation', yielding an iterable of
      dict-like records.

  Returns:
    A list with one label per record, in input order:
      1  if the record has an 'en' key (whether or not 'hi' is present),
      0  if the record has a 'hi' key but no 'en' key,
      -1 if the record has neither key.
  """
  def label_for(record):
    # 'en' takes precedence, so a record holding both languages is labelled 1.
    if 'en' in record:
      return 1
    return 0 if 'hi' in record else -1

  return [label_for(record) for record in data['translation']]


In [12]:

# Model input: the English side of every translation pair.
text = [item['en'] for item in data_en_hi['translation']]

# One label per record, derived from which language keys the record contains.
labels = prepare_labels(data_en_hi)

if len(text) != len(labels):
    raise ValueError("Number of sentences and labels don't match!")

# SVC.fit rejects targets with a single class ("The number of classes has to
# be greater than one"), so surface that condition here, next to where the
# labels are built, instead of only at training time.
distinct_labels = set(labels)
if len(distinct_labels) < 2:
    warnings.warn(
        f"labels contain fewer than two classes ({sorted(distinct_labels)}); "
        "SVC.fit will fail until the labelling yields at least two classes."
    )

# Bag-of-words features: one row per sentence.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text)

# Hold out 20% of the samples for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)


In [13]:
# Fit an RBF-kernel SVM (C=1) on the training split.
model = SVC(C=1, kernel='rbf')
model.fit(X_train, y_train)

# Predict the held-out split and report overall accuracy.
# accuracy_score comes from the notebook's import cell; no re-import is needed.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on testing data: {accuracy:.4f}")

# Confusion matrix: rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


ValueError: The number of classes has to be greater than one; got 1 class

In [10]:


# Step 4: hold out 20% of the samples for testing (seeded so the split repeats).
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, random_state=42, test_size=0.2
)

# Step 5: fit an RBF-kernel SVM with C=1 on the training split.
model = SVC(kernel='rbf', C=1)
model.fit(X_train, y_train)

# Step 6: score the predictions made on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model accuracy on testing data: {accuracy:.4f}")


ValueError: The number of classes has to be greater than one; got 1 class

In [8]:
# Sanity check: the feature matrix should have exactly one row per label.
n_feature_rows = X.shape[0]
print("Number of samples in X:", n_feature_rows)
n_labels = len(labels)
print("Number of samples in labels:", n_labels)


Number of samples in X: 186032
Number of samples in labels: 186032


In [40]:
# Fit a degree-3 polynomial-kernel SVM (C=1) on the training split.
model = SVC(C=1, kernel='poly', degree=3)
model.fit(X_train, y_train)

# Predict the held-out split and report overall accuracy.
# accuracy_score comes from the notebook's import cell; no re-import is needed.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on testing data: {accuracy:.4f}")

# Confusion matrix: rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Model accuracy on testing data: 0.9999
Confusion Matrix:
[[    0     1]
 [    0 18603]]


In [41]:
# Fit a linear-kernel SVM (C=1) on the training split.
model = SVC(C=1, kernel='linear')
model.fit(X_train, y_train)

# Predict the held-out split and report overall accuracy.
# accuracy_score comes from the notebook's import cell; no re-import is needed.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on testing data: {accuracy:.4f}")

# Confusion matrix: rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Model accuracy on testing data: 1.0000
Confusion Matrix:
[[    1     0]
 [    0 18603]]


In [42]:

# Fit an RBF-kernel SVM (C=2) on the training split.
model = SVC(C=2, kernel='rbf')
model.fit(X_train, y_train)

# Predict the held-out split and report overall accuracy.
# accuracy_score comes from the notebook's import cell; no re-import is needed.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on testing data: {accuracy:.4f}")

# Confusion matrix: rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


Model accuracy on testing data: 0.9999
Confusion Matrix:
[[    0     1]
 [    0 18603]]


In [43]:
# Fit a degree-3 polynomial-kernel SVM (C=2) on the training split.
model = SVC(C=2, kernel='poly', degree=3)
model.fit(X_train, y_train)

# Predict the held-out split and report overall accuracy.
# accuracy_score comes from the notebook's import cell; no re-import is needed.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on testing data: {accuracy:.4f}")

# Confusion matrix: rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Model accuracy on testing data: 0.9999
Confusion Matrix:
[[    0     1]
 [    0 18603]]


In [44]:
# Fit a linear-kernel SVM (C=2) on the training split.
model = SVC(C=2, kernel='linear')
model.fit(X_train, y_train)

# Predict the held-out split and report overall accuracy.
# accuracy_score comes from the notebook's import cell; no re-import is needed.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on testing data: {accuracy:.4f}")

# Confusion matrix: rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Model accuracy on testing data: 1.0000
Confusion Matrix:
[[    1     0]
 [    0 18603]]


In [45]:

# Fit an RBF-kernel SVM (C=5) on the training split.
model = SVC(C=5, kernel='rbf')
model.fit(X_train, y_train)

# Predict the held-out split and report overall accuracy.
# accuracy_score comes from the notebook's import cell; no re-import is needed.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on testing data: {accuracy:.4f}")

# Confusion matrix: rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


Model accuracy on testing data: 0.9999
Confusion Matrix:
[[    0     1]
 [    0 18603]]


In [1]:
# Fit a degree-3 polynomial-kernel SVM (C=5) on the training split.
model = SVC(C=5, kernel='poly', degree=3)
model.fit(X_train, y_train)

# Predict the held-out split and report overall accuracy.
# accuracy_score comes from the notebook's import cell; no re-import is needed.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on testing data: {accuracy:.4f}")

# Confusion matrix: rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Draw the matrix computed above as an annotated heatmap. The data argument
# must be `conf_matrix`, the only confusion-matrix variable this cell defines.
# NOTE(review): the tick labels assume label 0 = 'Negative' and 1 = 'Positive',
# in that order - confirm against the labelling scheme.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='g', xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

NameError: name 'SVC' is not defined

In [38]:
# Fit a linear-kernel SVM (C=5) on the training split.
model = SVC(C=5, kernel='linear')
model.fit(X_train, y_train)

# Predict the held-out split and report overall accuracy.
# accuracy_score comes from the notebook's import cell; no re-import is needed.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on testing data: {accuracy:.4f}")

# Confusion matrix: rows are true labels, columns are predicted labels.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Model accuracy on testing data: 1.0000
Confusion Matrix:
[[    1     0]
 [    0 18603]]
