importing necessary library

In [1]:
import pandas as pd

importing test file

In [2]:
lines_details = []

with open("msr_paraphrase_test.txt", "r") as file:
    for line in file:
        try:
            fields_detail = line.strip().split('\t')
            lines_details.append(fields_detail)
        except Exception as e:
            print("Parsing error occurred in specific line number:", line)
            print("Error:", e)
columns = ['Quality', '#1 ID', '#2 ID', '#1 String', '#2 String']
df = pd.DataFrame(lines_details, columns=columns)

print("\nDataset Details:")
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])
print("\nColumn names:")
print(df.columns)
print("\nData types:")
print(df.dtypes)



Dataset Details:
Number of rows: 1726
Number of columns: 5

Column names:
Index(['Quality', '#1 ID', '#2 ID', '#1 String', '#2 String'], dtype='object')

Data types:
Quality      object
#1 ID        object
#2 ID        object
#1 String    object
#2 String    object
dtype: object


Details of test dataset

In [3]:
df = df.tail(-1)
df.head()

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
1,1,1089874,1089925,"PCCW's chief operating officer, Mike Butcher, ...",Current Chief Operating Officer Mike Butcher a...
2,1,3019446,3019327,The world's two largest automakers said their ...,Domestic sales at both GM and No. 2 Ford Motor...
3,1,1945605,1945824,According to the federal Centers for Disease C...,The Centers for Disease Control and Prevention...
4,0,1430402,1430329,A tropical storm rapidly developed in the Gulf...,A tropical storm rapidly developed in the Gulf...
5,0,3354381,3354396,The company didn't detail the costs of the rep...,But company officials expect the costs of the ...


Importing train dataset

In [4]:
lines_value = []

with open("msr_paraphrase_train.txt", "r") as file:
    for line in file:
        try:
            fields_require = line.strip().split('\t')
            lines_value.append(fields_require)
        except Exception as e:
            print("Parsing error occurred in line number:", line)
            print("Error:", e)
columns = ['Quality', '#1 ID', '#2 ID', '#1 String', '#2 String']
df_train = pd.DataFrame(lines_value, columns=columns)

print("\nDataset Details:")
print("Number of rows:", df_train.shape[0])
print("Number of columns:", df_train.shape[1])
print("\nColumn names:")
print(df_train.columns)
print("\nData types:")
print(df_train.dtypes)


Dataset Details:
Number of rows: 4077
Number of columns: 5

Column names:
Index(['Quality', '#1 ID', '#2 ID', '#1 String', '#2 String'], dtype='object')

Data types:
Quality      object
#1 ID        object
#2 ID        object
#1 String    object
#2 String    object
dtype: object


Details of the train dataset

In [5]:
df_train = df_train.tail(-1)
df_train.head()

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
1,1,702876,702977,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi..."
2,0,2108705,2108831,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...
3,1,1330381,1330521,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an..."
4,0,3344667,3344648,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ..."
5,1,1236820,1236712,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...


Importing necessary library

In [6]:
import nltk
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Data preprocessing

In [8]:
def preprocess(text_value):
    tokens_score = nltk.word_tokenize(text_value)
    tokens_score = [token.lower() for token in tokens_score if token.isalnum() and token.lower() not in set(nltk.corpus.stopwords.words('english'))]
    return ' '.join(tokens_score)

In [9]:
df_train['Preprocessed'] = df_train['#1 String'].apply(preprocess) + ' ' + df_train['#2 String'].apply(preprocess)
df['Preprocessed'] = df['#1 String'].apply(preprocess) + ' ' + df['#2 String'].apply(preprocess)

In [10]:
tfidf_vectorizer = TfidfVectorizer()
train_msr_tfidf = tfidf_vectorizer.fit_transform(df_train['Preprocessed'])
test_msr_tfidf = tfidf_vectorizer.transform(df['Preprocessed'])

Find cosine similarity

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
cosine = cosine_similarity(test_msr_tfidf, train_msr_tfidf)

LEX rank process

In [12]:
import networkx as nx
def congraph(similarity_matrix, threshold=0.8):
    graph = nx.Graph()
    n_msr_test, n_msr_train = similarity_matrix.shape
    for i in range(n_msr_test):
        for j in range(n_msr_train):
            if similarity_matrix[i][j] > threshold:
                graph.add_edge(f"test_{i}", f"train_{j}", weight=similarity_matrix[i][j])
    return graph

graph = congraph(cosine)

In [13]:
scores_value = nx.pagerank(graph)

In [14]:
scores_value

{'test_9': 0.002912975386167597,
 'train_92': 0.0016415543374026743,
 'train_361': 0.0018654113490241102,
 'train_400': 0.0018014551384661036,
 'train_738': 0.0018314049369446844,
 'train_1041': 0.0015940363907767942,
 'train_1385': 0.0014607363104155058,
 'train_1530': 0.0015808894416580851,
 'train_1672': 0.0012553717880571553,
 'train_1771': 0.0016231369258190688,
 'train_1852': 0.0018014551384661036,
 'train_1973': 0.0016138594980382814,
 'train_2113': 0.0015940363907767942,
 'train_2246': 0.001249883454013542,
 'train_2647': 0.0007917216945912231,
 'train_2723': 0.001438554508678193,
 'train_2759': 0.0018731993657202194,
 'train_2772': 0.0012594189756654516,
 'train_2796': 0.0012553717880571553,
 'train_3091': 0.0018731993657202194,
 'train_3251': 0.0016024435490738836,
 'train_3296': 0.0010846052889731043,
 'train_3393': 0.001467576109649473,
 'train_3742': 0.0012429677480490465,
 'train_3758': 0.001606751249836245,
 'train_3855': 0.0016729962792418325,
 'train_3966': 0.001580889

In [15]:
threshold = 0.7
similar_pairs = []
for i in range(len(cosine)):
    for j in range(i+1, len(cosine[i])):
        if cosine[i][j] > threshold:
            similar_pairs.append((i, j))

Finding the similarity pair

In [16]:
for pair in similar_pairs:
    print(f"Pair {pair}: Similarity Score: {cosine[pair]}")

Pair (9, 92): Similarity Score: 0.9121126181461727
Pair (9, 361): Similarity Score: 0.889696164792642
Pair (9, 397): Similarity Score: 0.7380212230491453
Pair (9, 400): Similarity Score: 0.8969435949152651
Pair (9, 738): Similarity Score: 0.886269127637651
Pair (9, 1041): Similarity Score: 0.8983702753385885
Pair (9, 1138): Similarity Score: 0.769643889072131
Pair (9, 1385): Similarity Score: 0.8537193728616552
Pair (9, 1530): Similarity Score: 0.826507517951787
Pair (9, 1672): Similarity Score: 0.8357742906097743
Pair (9, 1771): Similarity Score: 1.0
Pair (9, 1809): Similarity Score: 0.7739496847400632
Pair (9, 1852): Similarity Score: 0.8969435949152651
Pair (9, 1973): Similarity Score: 0.9116410246847257
Pair (9, 2113): Similarity Score: 0.8983702753385885
Pair (9, 2236): Similarity Score: 0.7811975826399749
Pair (9, 2246): Similarity Score: 0.832396374403504
Pair (9, 2646): Similarity Score: 0.7811975826399749
Pair (9, 2647): Similarity Score: 0.8074649923064137
Pair (9, 2663): Sim

Logistic regression

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [18]:
X_msr_train = df_train['#1 String'] + ' ' + df_train['#2 String']
y_msr_train = df_train['Quality']
X_msr_test = df['#1 String'] + ' ' + df['#2 String']
y_msr_test = df['Quality']

In [19]:
tfidf_vectorizer = TfidfVectorizer()
X_msr_train_tfidf = tfidf_vectorizer.fit_transform(X_msr_train)
X_msr_test_tfidf = tfidf_vectorizer.transform(X_msr_test)

In [20]:
logistic = LogisticRegression()
logistic.fit(X_msr_train_tfidf, y_msr_train)

In [21]:
y_pred_train = logistic.predict(X_msr_train_tfidf)
y_msr_pred_test = logistic.predict(X_msr_test_tfidf)

In [22]:
print("Training Set of collected text:")
print("Accuracy score:", accuracy_score(y_msr_train, y_pred_train))
print("Classification Report:")
print(classification_report(y_msr_train, y_pred_train))

print("\nTest Set of collected text:")
print("Accuracy score:", accuracy_score(y_msr_test, y_msr_pred_test))
print("Classification Report:")
print(classification_report(y_msr_test, y_msr_pred_test))

Training Set of collected text:
Accuracy score: 0.7612855740922473
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.29      0.44      1323
           1       0.74      0.99      0.85      2753

    accuracy                           0.76      4076
   macro avg       0.83      0.64      0.65      4076
weighted avg       0.80      0.76      0.72      4076


Test Set of collected text:
Accuracy score: 0.6846376811594203
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.14      0.23       578
           1       0.69      0.96      0.80      1147

    accuracy                           0.68      1725
   macro avg       0.66      0.55      0.52      1725
weighted avg       0.67      0.68      0.61      1725

