In [37]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [38]:
train_df = pd.read_json("./data/train.json")
validation_df = pd.read_json("./data/validation.json")
test_df = pd.read_json("./data/test.json")

In [39]:
# Combine the DataFrames
combined_data = pd.concat([train_df, validation_df], ignore_index=True)

# Determine the size for train and validation sets after combining
train_size = int(0.8 * len(combined_data))  # Assuming 80% for training and 20% for validation

# Split the combined data back into train and validation DataFrames
train_data = combined_data[:train_size]
validation_data = combined_data[train_size:]

# Resetting index for both DataFrames
train_data.reset_index(drop=True, inplace=True)
validation_data.reset_index(drop=True, inplace=True)

In [40]:
# Verify the shapes of train and validation data
print("Train data:", train_data['label'].value_counts())
print("Validation data:", validation_data['label'].value_counts())

Train data: label
3    24030
2    21658
0     2162
1     1088
Name: count, dtype: int64
Validation data: label
3    6248
2    5199
0     504
1     284
Name: count, dtype: int64


In [41]:
train_df['combined_sentences'] = train_df['sentence1'] + ' ' + train_df['sentence2']
test_df['combined_sentences'] = test_df['sentence1'] + ' ' + test_df['sentence2']
validation_df['combined_sentences'] = validation_df['sentence1'] + ' ' + validation_df['sentence2']

In [42]:
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import spacy

ss = SnowballStemmer("romanian")
sw = stopwords.words("romanian")
nlp = spacy.load("ro_core_news_sm")

def text_preparetion(sentence):
    # 1. Lowercase everything
    sentence = sentence.lower()

    # 2. Remove all symbols other than a-z@#.
    sentence = re.sub(r"[^a-zăâîșț@# ]", "", sentence)

    # Tokenize the preprocessed sentence
    tokenization = nlp(sentence)

    # 4. Remove stopwords and empty tokens, and perform stemming
    list_text_preprocessed = [
        ss.stem(word.text) for word in tokenization if word.text not in sw and word.pos_ != "SPACE"
    ]

    # Join the stemmed tokens back into a sentence
    processed_text = ' '.join(list_text_preprocessed)

    return processed_text

train_df['all_tokens'] = train_df['combined_sentences'].apply(text_preparetion)
test_df['all_tokens'] = test_df['combined_sentences'].apply(text_preparetion)
validation_df['all_tokens'] = validation_df['combined_sentences'].apply(text_preparetion)

In [43]:
train_df.head()

Unnamed: 0,sentence1,sentence2,label,guid,combined_sentences,all_tokens
0,Primul taragotist român a fost Nicolae Luță Io...,"Colegiul de arhitectură, artă și planificare (...",3,7cec5ac4-c115-4976-8d2a-9badfe9b63b9,Primul taragotist român a fost Nicolae Luță Io...,taragotist român nicola luț ioviț originar ban...
1,Lupta revoluționarilor este condusă de Avram I...,Schiul nordic face parte din programul olimpic...,3,bc2fa29f-cf22-4a7c-8b55-9b1ed019f6ac,Lupta revoluționarilor este condusă de Avram I...,lupt revoluționar condus avram iancu ioan ciur...
2,Locuitorii liberi au devenit „''iobagiones cas...,"În anii 1960, ea a apărut în drame realizate l...",3,8547b1ef-7bfe-43a9-aedf-bad0c0fbc049,Locuitorii liberi au devenit „''iobagiones cas...,locuit liber deven iobagiones castr iobag cetă...
3,În anul 2002 are loc lansarea în domeniul turi...,Se lansează primul hotel al grupului în otopen...,2,0ad1ce19-7aa9-4ddd-b8d6-822072a723b0,În anul 2002 are loc lansarea în domeniul turi...,an loc lans domen turistichotelier lans hotel ...
4,"Zillich a mijlocit, prin revista ''Klingsor'',...","Au apărut lucrări ale lui ion luca caragiale, ...",2,50c44ffa-b0c1-4d98-bc6c-3bbf95f50896,"Zillich a mijlocit, prin revista ''Klingsor'',...",zillich mijloc revist klingsor debut multor ti...


In [44]:
X_train = train_df['all_tokens']
y_train = train_df['label']

In [45]:
X_val = validation_df['all_tokens']
y_val = validation_df['label']

In [46]:
X_test = test_df['all_tokens']

In [47]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

In [48]:
X_test_tfidf = vectorizer.transform(X_test)

In [33]:
svm = SVC(kernel='linear')
svm.fit(X_train_tfidf, y_train)

In [34]:
import joblib

joblib.dump(svm, './models/svm_cu_tekenizare_ugraded.pkl')

['./models/svm_cu_tekenizare_ugraded.pkl']

In [35]:
import joblib
svm = joblib.load("./models/svm_cu_tekenizare.pkl")

In [49]:
val_predictions = svm.predict(X_val_tfidf)

ValueError: X has 129810 features, but SVC is expecting 191634 features as input.

In [None]:
from sklearn.metrics import classification_report

report = classification_report(y_val, val_predictions)
print(report)

In [None]:
accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy: {accuracy}")

In [None]:
y_test = svm.predict(X_test_tfidf)

In [None]:
result_df = pd.DataFrame({
    'guid': test_df['guid'],
    'label': y_test
})

In [None]:
result_df.head()

In [None]:
result_df.to_csv("./data/submission3.csv", index=False)

In [None]:
test_df.size