In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
import numpy as np
from numpy.linalg import norm

In [4]:
def text_similarity(text1, text2):
    # Tokenize and lemmatize the texts
    tokens1 = word_tokenize(text1)
    tokens2 = word_tokenize(text2)
    lemmatizer = WordNetLemmatizer()
    tokens1 = [lemmatizer.lemmatize(token) for token in tokens1]
    tokens2 = [lemmatizer.lemmatize(token) for token in tokens2]

    # Remove stopwords
    stop_words = stopwords.words('english')
    tokens1 = [token for token in tokens1 if token not in stop_words]
    tokens2 = [token for token in tokens2 if token not in stop_words]

    # Create the TF-IDF vectors
    vectorizer = TfidfVectorizer()
    vector1 = vectorizer.fit_transform(tokens1)
    print(vector1.shape)
    vector2 = vectorizer.transform(tokens2)
    print(vector2.shape)
    # Calculate the cosine similarity
    #similarity = cosine_similarity(vector1, vector2)
    similarity = np.dot(vector1, vector2)/(norm(vector1)*norm(vector2))
    #print(similarity)

    return similarity

In [5]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import csv
with open('Judgement_4_csvfile.csv') as file_obj:
      
    # Create reader object by passing the file
    # object to DictReader method
    reader_obj = csv.DictReader(file_obj)
      
    # Iterate over each row in the csv file
    # using reader object
    for row in reader_obj:
        print(row)

In [13]:
issue = 'The heart of the matter would be whether the distinction made in Radheshyam Kejriwal (supra) applies to the factual score to the case at hand.'
FAC1 = 'The present appeals, by special leave, are directed against the order dated 14.10.2013 passed by the High Court of Judicature at Signature Not Verified Bombay in Criminal Application No.497 of 2011 assailing the order Digitally signed by CHETAN KUMAR Date: 2016.06.01 16:56:12 IST'
FAC2 = 'Being grieved by the order passed by the adjudicating authority, the company as well as the Directors preferred Appeal No.517 of 2005 and other connected appeals before the Appellate Tribunal for Foreign Exchange (for short, the tribunal).'
FAC3 = 'From this sequence, it further flows that Section 18(2) is not applicable to the goods which were sold in international market by way of international transactions because these provisions are made applicable to the goods which are otherwise covered under Section 18(1)(a) and not otherwise.'
FAC4 = 'As the goods in question were never exported outside India so Section 18(2) is in no way can be applied to these transactions because such international selling is not governed by Section 18(1)(a) of FER Act.'
FAC5 =  'The impugned order has repeatedly said that for purchase of CPT colour tubes from Japan and Korea the appellant spent the foreign exchange.'
FAC6 = 'May it be so.'
FAC7 = 'But such spending of foreign exchange in international trade by an Indian person is not forbidden by Section 19 of FER Act.'
FAC8 = 'If that is so, the appellant cannot be held guilty for Section 18(2) read with Section 18(3) of FER Act, 1973.'


In [14]:
corpus = [issue, FAC1, FAC2, FAC3, FAC4, FAC5, FAC6, FAC7, FAC8]

In [None]:
print(corpus)

['The question, therefore, that arises in the present appeals is the entitlement of the appellant – University – Assessee to exemption from payment of tax under the provisions of Section 10(23C)(iiiab) of the Act which is in the following terms: 10. Incomes not included in total income.', 'Leave granted.', 'The appellant – University, namely, Visvesvraya Technological University (VTU) has been constituted under the Visveswaraiah Technological University Act, 1994 (for short “VTU Act”).', 'It discharges functions earlier performed by the Department of Technical Education, Government of Karnataka.', 'The University exercises control over all Government and Private Engineering Colleges within Karnataka.', 'For the Assessment Years 2004-2005 to 2009-2010 notices under Section 148 of the Income Tax Act, 1961 (for short “the Act”) were issued to the appellant – University – Assessee.', 'Eventually returns were filed for the Assessment Years in question declaring "Nil" income and claiming exe

In [15]:
# Initialize an instance of tf-idf Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Generate the tf-idf vectors for the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)



In [16]:
# compute and print the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.17918635 0.22728647 0.17255114 0.13993058 0.17161196
  0.05367329 0.03727602 0.0899831 ]
 [0.17918635 1.         0.25499513 0.12579898 0.11174336 0.14298999
  0.         0.13303284 0.04659036]
 [0.22728647 0.25499513 1.         0.09612531 0.12960494 0.28114853
  0.         0.11526254 0.10540278]
 [0.17255114 0.12579898 0.09612531 1.         0.4128139  0.11555533
  0.0646909  0.17374412 0.20419986]
 [0.13993058 0.11174336 0.12960494 0.4128139  1.         0.03867394
  0.11433386 0.30775202 0.35222614]
 [0.17161196 0.14298999 0.28114853 0.11555533 0.03867394 1.
  0.         0.08772884 0.15832267]
 [0.05367329 0.         0.         0.0646909  0.11433386 0.
  1.         0.         0.14921031]
 [0.03727602 0.13303284 0.11526254 0.17374412 0.30775202 0.08772884
  0.         1.         0.17675754]
 [0.0899831  0.04659036 0.10540278 0.20419986 0.35222614 0.15832267
  0.14921031 0.17675754 1.        ]]


In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
count_vectorizer = CountVectorizer()
vector_matrix = count_vectorizer.fit_transform(corpus)
vector_matrix

<9x147 sparse matrix of type '<class 'numpy.int64'>'
	with 222 stored elements in Compressed Sparse Row format>

In [82]:
import pandas as pd

def create_dataframe(matrix, tokens):

    doc_names = [f'doc_{i+1}' for i, _ in enumerate(matrix)]
    df = pd.DataFrame(data=matrix, index=tokens, columns=tokens)
    
    return(df)

In [83]:
cosine_similarity_matrix = cosine_similarity(vector_matrix)
create_dataframe(cosine_similarity_matrix,['issue','FAC1', 'FAC2', 'FAC3', 'FAC4', 'FAC5', 'FAC6', 'FAC7', 'FAC8'])

Unnamed: 0,issue,FAC1,FAC2,FAC3,FAC4,FAC5,FAC6,FAC7,FAC8
issue,1.0,0.412568,0.518688,0.302944,0.24189,0.433383,0.072932,0.085819,0.204211
FAC1,0.412568,1.0,0.500183,0.259161,0.230967,0.35015,0.0,0.27735,0.141421
FAC2,0.518688,0.500183,1.0,0.224221,0.207514,0.532518,0.0,0.179969,0.206474
FAC3,0.302944,0.259161,0.224221,1.0,0.534114,0.226863,0.061085,0.287513,0.317641
FAC4,0.24189,0.230967,0.207514,0.534114,1.0,0.111979,0.150756,0.443484,0.482418
FAC5,0.433383,0.35015,0.532518,0.226863,0.111979,1.0,0.0,0.145671,0.259973
FAC6,0.072932,0.0,0.0,0.061085,0.150756,0.0,1.0,0.0,0.2
FAC7,0.085819,0.27735,0.179969,0.287513,0.443484,0.145671,0.0,1.0,0.274563
FAC8,0.204211,0.141421,0.206474,0.317641,0.482418,0.259973,0.2,0.274563,1.0


In [37]:
updated_corpus = FAC2+'.' + FAC5+'.' + FAC1+'.' + FAC3+'.' + FAC4+'.' + FAC8+'.'

In [38]:
print(updated_corpus)

Being grieved by the order passed by the adjudicating authority, the company as well as the Directors preferred Appeal No.517 of 2005 and other connected appeals before the Appellate Tribunal for Foreign Exchange (for short, the tribunal)..The impugned order has repeatedly said that for purchase of CPT colour tubes from Japan and Korea the appellant spent the foreign exchange..The present appeals, by special leave, are directed against the order dated 14.10.2013 passed by the High Court of Judicature at Signature Not Verified Bombay in Criminal Application No.497 of 2011 assailing the order Digitally signed by CHETAN KUMAR Date: 2016.06.01 16:56:12 IST.From this sequence, it further flows that Section 18(2) is not applicable to the goods which were sold in international market by way of international transactions because these provisions are made applicable to the goods which are otherwise covered under Section 18(1)(a) and not otherwise..As the goods in question were never exported ou

In [29]:
print(cosine_similarity_matrix[0][1])

0.41256849850351746


In [22]:
import matplotlib.pyplot as plt
import numpy as np

In [30]:
plt.plot(cosine_similarity_matrix[0][0], cosine_similarity_matrix[0][1], cosine_similarity_matrix[0][2], cosine_similarity_matrix[0][3, cosine_similarity_matrix[0][4], cosine_similarity_matrix[0][5], cosine_similarity_matrix[0][6], cosine_similarity_matrix[0][7], cosine_similarity_matrix[0][8]])
plt.title('GRID REPRESENTATION')
plt.xlabel('X-axis')
plt.ylabel('Y-axis')

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices