In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import vstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.svm import SVC

In [None]:
# Labeled data: keep only the id, the headline and its sentiment label.
global_news_data = pd.read_csv(
    r"C:\Shivangi\college\Sem 4\MLPR\project\news-more-context\global news data set\rating.csv"
).loc[:, ['article_id', 'title', 'title_sentiment']]

# Unlabeled data: every row gets the placeholder label -1.
bbc_data = pd.read_csv(
    r"C:\Shivangi\college\Sem 4\MLPR\project\news-more-context\bbc_news.csv"
).assign(title_sentiment=-1)

# Stack labeled rows first, then the unlabeled ones.
# Later cells slice the labeled part back out by position, so this order matters.
combined_data = pd.concat(
    [global_news_data, bbc_data.loc[:, ['title', 'title_sentiment']]]
)

In [None]:
# Build TF-IDF features from the headlines of the labeled and unlabeled rows together,
# so both share a single vocabulary.
vectorizer = TfidfVectorizer()
X_combined = vectorizer.fit_transform(combined_data.loc[:, 'title'])

# Label vector aligned row-for-row with X_combined.
y_combined = combined_data.loc[:, 'title_sentiment'].values

In [None]:
# Split only the labeled rows (the first len(global_news_data) rows of the
# combined matrix) into 80% train / 20% held-out evaluation data.
#
# train_size is an absolute row count: 80% of the labeled rows. It was
# previously len(global_news_data) - len(bbc_data), which has nothing to do
# with an 80/20 split and can even be negative or larger than the labeled set.
train_size = int(0.8 * len(global_news_data))
X_train_eval, X_test_eval, y_train_eval, y_test_eval = train_test_split(
    X_combined[:len(global_news_data)], y_combined[:len(global_news_data)],
    train_size=train_size, random_state=42)

In [None]:
# Initialize the base estimator and the SelfTrainingClassifier.
# probability=True is required because the 'threshold' criterion relies on
# predict_proba; random_state makes the probability calibration repeatable.
base_estimator = SVC(probability=True, random_state=42)
self_training_model = SelfTrainingClassifier(base_estimator, threshold=0.7, criterion='threshold')

# Train on the 80% labeled training split plus the unlabeled rows only.
# Fitting on all of X_combined would also include the 20% of labeled rows that
# are reserved for evaluation, leaking the test set into training.
n_labeled = len(global_news_data)
X_fit = vstack([X_train_eval, X_combined[n_labeled:]], format='csr')
y_fit = np.concatenate([y_train_eval, y_combined[n_labeled:]])
self_training_model.fit(X_fit, y_fit)

In [None]:
# Score the fitted model on the held-out labeled rows and print each metric
# under its own heading.
predicted_test = self_training_model.predict(X_test_eval)

print("Evaluation on Test Data:")
print(f"Accuracy: {accuracy_score(y_test_eval, predicted_test)}")
for heading, metric in (
    ("Confusion Matrix:", confusion_matrix),
    ("Classification Report:", classification_report),
):
    print(heading)
    print(metric(y_test_eval, predicted_test))

In [7]:
import pandas as pd

rating = pd.read_csv(r"C:\Shivangi\college\Sem 4\MLPR\project\news-more-context\global news data set\rating.csv")
data = pd.read_csv(r"C:\Shivangi\college\Sem 4\MLPR\project\news-more-context\global news data set\data.csv")

# Keep the articles that are present in data.csv but have no matching
# article_id in rating.csv: the merge indicator tags those rows "left_only".
new_data = (
    pd.merge(data, rating, on='article_id', how='outer', indicator=True)
    .query('_merge=="left_only"')
    .drop('_merge', axis=1)
)
print(new_data.shape)
print(new_data.columns)

# Only title, source_name and title_sentiment are needed. The *_x columns are
# the data.csv side of the merge. title_sentiment carries no suffix because it
# exists only in rating.csv, so it is NaN for every one of these unrated rows
# and the value_counts() below prints an empty Series.
# rename() is chained (not inplace) so it never operates on a slice of the
# merged frame.
new_data = new_data[['title_x', 'source_name_x', 'title_sentiment']].rename(
    columns={'title_x': 'title', 'source_name_x': 'source_name'}
)
print(new_data['title_sentiment'].value_counts())
new_data.to_csv(r"C:\Shivangi\college\Sem 4\MLPR\project\news-more-context\global news data set\uncommon_data.csv", index=False)

# NOTE(review): an unfinished `raw_data = ` assignment (no right-hand side) was
# removed from the end of this cell; it was a SyntaxError that prevented the
# whole cell from running. TODO: re-add it once the intended value is known.

(47249, 24)
Index(['article_id', 'source_id_x', 'source_name_x', 'author_x', 'title_x',
       'description_x', 'url_x', 'url_to_image_x', 'published_at_x',
       'content_x', 'category_x', 'full_content', 'source_id_y',
       'source_name_y', 'author_y', 'title_y', 'description_y', 'url_y',
       'url_to_image_y', 'published_at_y', 'content_y', 'category_y',
       'article', 'title_sentiment'],
      dtype='object')
Series([], Name: count, dtype: int64)


In [None]:
# extract all positive and negative sentiment data
from translate import Translator

def back_translate(series, to_lang='de', from_lang='en'):
    """Round-trip every entry of ``series`` through another language.

    Each value is translated from ``from_lang`` to ``to_lang`` and the result
    is translated back to ``from_lang``.
    NOTE(review): presumably used to paraphrase titles for data augmentation;
    confirm with the author.

    Args:
        series: pandas Series whose values are passed to ``Translator.translate``.
        to_lang: intermediate language code (default ``'de'``).
        from_lang: language code of the input and of the returned text
            (default ``'en'``).

    Returns:
        The Series produced by applying the round trip to each element.
    """
    forward = Translator(to_lang=to_lang, from_lang=from_lang)
    backward = Translator(to_lang=from_lang, from_lang=to_lang)

    def round_trip(text):
        # Outbound translation first, then back into the source language.
        return backward.translate(forward.translate(text))

    return series.apply(round_trip)

# Extract the rows of rating.csv whose title_sentiment is 1 (positive) or
# 0 (negative).
# .copy() makes each subset an independent frame: assigning a column on a
# boolean-mask slice of `rating` raises SettingWithCopyWarning and leaves it
# ambiguous whether `rating` itself is modified.
positive_data = rating[rating['title_sentiment'] == 1].copy()
negative_data = rating[rating['title_sentiment'] == 0].copy()

# Replace each title with its back-translated version.
positive_data['title'] = back_translate(positive_data['title'])
negative_data['title'] = back_translate(negative_data['title'])
