In [1]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
train_df = pd.read_json("./data/train.json")
validation_df = pd.read_json("./data/validation.json")
test_df = pd.read_json("./data/test.json")

In [3]:
# Combine the DataFrames
combined_data = pd.concat([train_df, validation_df], ignore_index=True)

# Determine the size for train and validation sets after combining
train_size = int(0.8 * len(combined_data))  # Assuming 80% for training and 20% for validation

# Split the combined data back into train and validation DataFrames
train_data = combined_data[:train_size]
validation_data = combined_data[train_size:]

# Resetting index for both DataFrames
train_data.reset_index(drop=True, inplace=True)
validation_data.reset_index(drop=True, inplace=True)

In [4]:
# Verify the shapes of train and validation data
print("Train data:", train_data['label'].value_counts())
print("Validation data:", validation_data['label'].value_counts())

Train data: label
3    24030
2    21658
0     2162
1     1088
Name: count, dtype: int64
Validation data: label
3    6248
2    5199
0     504
1     284
Name: count, dtype: int64


In [5]:
train_df['combined_sentences'] = train_df['sentence1'] + ' ' + train_df['sentence2']
test_df['combined_sentences'] = test_df['sentence1'] + ' ' + test_df['sentence2']
validation_df['combined_sentences'] = validation_df['sentence1'] + ' ' + validation_df['sentence2']

In [6]:
import re
import spacy.lang.ro.stop_words as stop_words
import spacy

nlp = spacy.load("ro_core_news_sm")
pattern = re.compile(r"[^a-zăâîșț@# ]")

# Convert stop words list to a set for faster lookup
sw_set = set(stop_words.STOP_WORDS)
sw_lower = set(word.lower() for word in stop_words.STOP_WORDS)

def text_preparetion(sentence):
    # 1. Lowercase everything
    sentence = sentence.lower()

    # 2. Remove all symbols other than a-z@#.
    sentence = re.sub(pattern, "", sentence)

    # Tokenize the preprocessed sentence
    tokenization = nlp(sentence)

    # 4. Remove stopwords and empty tokens, and perform stemming
    list_text_preprocessed = [
        token.lemma_ for token in tokenization if 
        # not token.is_stop and 
        token.pos_ != "SPACE"
    ]
    # for token in tokenization:
    #     print(
    #         f"Text: {token.text}, Lemma: {token.lemma_}, POS Tag: {token.pos_}, Dependency: {token.dep_}, "
    #         f"Stop Word?: {token.is_stop}, Entity Type: {token.ent_type_}"
    #     )
    # Join the stemmed tokens back into a sentence
    processed_text = ' '.join(list_text_preprocessed)

    return processed_text


train_df['all_tokens'] = train_df['combined_sentences'].apply(text_preparetion)
test_df['all_tokens'] = test_df['combined_sentences'].apply(text_preparetion)
validation_df['all_tokens'] = validation_df['combined_sentences'].apply(text_preparetion)

In [7]:
train_df.head()

Unnamed: 0,sentence1,sentence2,label,guid,combined_sentences,all_tokens
0,Primul taragotist român a fost Nicolae Luță Io...,"Colegiul de arhitectură, artă și planificare (...",3,7cec5ac4-c115-4976-8d2a-9badfe9b63b9,Primul taragotist român a fost Nicolae Luță Io...,prim taragotist român avea fi nicolae luț iovi...
1,Lupta revoluționarilor este condusă de Avram I...,Schiul nordic face parte din programul olimpic...,3,bc2fa29f-cf22-4a7c-8b55-9b1ed019f6ac,Lupta revoluționarilor este condusă de Avram I...,luptă revoluționar fi conduce de avram iancu i...
2,Locuitorii liberi au devenit „''iobagiones cas...,"În anii 1960, ea a apărut în drame realizate l...",3,8547b1ef-7bfe-43a9-aedf-bad0c0fbc049,Locuitorii liberi au devenit „''iobagiones cas...,locuitor liber avea deveni iobagiones castru i...
3,În anul 2002 are loc lansarea în domeniul turi...,Se lansează primul hotel al grupului în otopen...,2,0ad1ce19-7aa9-4ddd-b8d6-822072a723b0,În anul 2002 are loc lansarea în domeniul turi...,în an avea loc lansare în domeniu turistichote...
4,"Zillich a mijlocit, prin revista ''Klingsor'',...","Au apărut lucrări ale lui ion luca caragiale, ...",2,50c44ffa-b0c1-4d98-bc6c-3bbf95f50896,"Zillich a mijlocit, prin revista ''Klingsor'',...",zillich avea mijloci prin revistă klingsor deb...


In [8]:
X_train = train_df['all_tokens']
y_train = train_df['label']

In [9]:
X_val = validation_df['all_tokens']
y_val = validation_df['label']

In [10]:
X_test = test_df['all_tokens']

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_train_bagofwords = vectorizer.fit_transform(X_train)
X_val_bagofwords = vectorizer.transform(X_val)
X_test_bagofwords = vectorizer.transform(X_test)

In [None]:
svm = SVC(kernel='linear')
svm.fit(X_train_bagofwords, y_train)

In [None]:
import joblib

joblib.dump(svm, './models/svm_cu_tokenizare_bagofwords.pkl')

In [None]:
import joblib
svm = joblib.load("./models/svm_cu_tekenizare.pkl")

In [None]:
val_predictions = svm.predict(X_val_bagofwords)

In [None]:
from sklearn.metrics import classification_report

report = classification_report(y_val, val_predictions)
print(report)

In [None]:
y_test = svm.predict(X_test_bagofwords)

In [None]:
result_df = pd.DataFrame({
    'guid': test_df['guid'],
    'label': y_test
})

In [None]:
result_df.head()

In [None]:
result_df.to_csv("./data/submission3.csv", index=False)

In [None]:
test_df.size