# Data Mount

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings — consider a narrower filter.
warnings.filterwarnings('ignore')
from google.colab import drive
# Mount Google Drive at /content/drive/ (Colab-only; the dataset path used below lives under it).
drive.mount('/content/drive/')


Mounted at /content/drive/


In [None]:

# Load dataset
# The CSV is read from the mounted Google Drive; the location is specific to this Drive layout.
path = '/content/drive/MyDrive/Colab Notebooks/terrorismdataset.csv'
# The file is decoded as cp1252 (Windows-1252) rather than pandas' default encoding.
df = pd.read_csv(path, encoding='cp1252')



# Preprocessing

In [None]:
# Preprocessing
# Flag missing cells explicitly (real NaN plus the ' ' and 'NULL' placeholders)
# instead of first overwriting NaN with a -1 sentinel: with the sentinel, any
# genuine -1 in the data was counted as missing and later rewritten to 'NaN'.
_missing_mask = df.isna() | df.isin([' ', 'NULL'])

# Keep only the columns in which fewer than 60% of the cells are missing.
df = df.loc[:, _missing_mask.mean() < .6]

# Render the remaining NaN cells as the literal string 'NaN' so they appear as
# text when the rows are turned into documents. A new frame is assigned rather
# than mutating in place, which keeps the cell safe to re-run.
df = df.fillna('NaN')
df.head(5)



Unnamed: 0,eventid,iyear,imonth,iday,extended,country,country_txt,region,region_txt,provstate,...,property,ishostkid,ransom,scite1,scite2,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY
0,197000000001,1970,7,2,0,58,Dominican Republic,2,Central America & Caribbean,,...,0,0.0,0.0,,,PGIS,0,0,0,0
1,197000000002,1970,0,0,0,130,Mexico,1,North America,Federal,...,0,1.0,1.0,,,PGIS,0,1,1,1
2,197001000001,1970,1,0,0,160,Philippines,5,Southeast Asia,Tarlac,...,0,0.0,0.0,,,PGIS,-9,-9,1,1
3,197001000002,1970,1,0,0,78,Greece,8,Western Europe,Attica,...,1,0.0,0.0,,,PGIS,-9,-9,1,1
4,197001000003,1970,1,0,0,101,Japan,4,East Asia,Fukouka,...,1,0.0,0.0,,,PGIS,-9,-9,1,1


In [None]:
# Create documents
# Two text renderings of every row are produced in ONE pass over the frame
# (iterrows() is slow, and the frame was previously walked twice):
#   documents - one "column: value" line per column (labelled form)
#   docs      - one bare value per line, in column order (the form that is
#               vectorized with TF-IDF further down)
documents = []
docs = []
for index, row in df.iterrows():
    documents.append('\n'.join(f'{col_name}: {str(value)}' for col_name, value in row.items()))
    docs.append('\n'.join(str(value) for value in row.values))



In [None]:
# Tokenization using NLTK
import nltk
# Download the 'punkt' resource (network access); presumably required by nltk.word_tokenize below.
nltk.download('punkt')
# Lower-case and tokenize every value-only document in `docs` (one token list per document).
tokenized_documents = [nltk.word_tokenize(doc.lower()) for doc in docs]
# d1: the token lists of the first 5000 documents, converted to tuples.
d1 = [tuple(doc) for doc in tokenized_documents[:5000]]
# d2: tuple() over a raw string yields one element per *character*, not per token.
# NOTE(review): confirm this is intended — neither d1 nor d2 is read by the cells shown here.
d2 = [tuple(doc) for doc in docs[:5000]]



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Dynamic Reduction

In [None]:

from sklearn.decomposition import PCA, TruncatedSVD

# Ask the user to choose the dimensionality reduction method (PCA or SVD)
# The answer is upper-cased so 'pca' / 'svd' are accepted; it is validated further down in this cell.
reduction_method = input("Choose the dimensionality reduction method (PCA or SVD): ").upper()

# Apply dimensionality reduction based on user's choice
def apply_dimensionality_reduction(X_tfidf, n_components=100, method='SVD'):
    """Fit a reducer on ``X_tfidf`` and return the projected data with the model.

    Args:
        X_tfidf: Matrix to reduce; it is handed to the reducer's ``fit_transform``.
        n_components: Number of components requested from the reducer.
        method: ``'SVD'`` (TruncatedSVD) or ``'PCA'``; the comparison is
            case-sensitive.

    Returns:
        A ``(X_reduced, reducer)`` tuple: the output of ``fit_transform`` and
        the fitted reducer instance.

    Raises:
        ValueError: If ``method`` is neither ``'SVD'`` nor ``'PCA'``.
    """
    # Guard clause: reject anything but the two supported names up front.
    if method != 'SVD' and method != 'PCA':
        raise ValueError("Invalid dimensionality reduction method. Choose 'PCA' or 'SVD'.")

    if method == 'PCA':
        print("Using PCA for dimensionality reduction...")
        model_cls = PCA
    else:
        print("Using SVD for dimensionality reduction...")
        model_cls = TruncatedSVD

    # Both reducers take the same constructor arguments; the seed is fixed at 42.
    model = model_cls(n_components=n_components, random_state=42)
    return model.fit_transform(X_tfidf), model

# Validate dimensionality reduction method input
# Anything other than the two supported names falls back to SVD with a printed notice.
if reduction_method not in ['PCA', 'SVD']:
    print(f"Invalid reduction method. Using default 'SVD'.")
    reduction_method = 'SVD'

# Apply TF-IDF vectorization to `docs`
# max_features=10000 limits the vocabulary size; English stop words are removed.
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X_tfidf = vectorizer.fit_transform(docs)

# Apply the dimensionality reduction method (SVD or PCA)
# n_components is left at the function default (100). The fitted reducer is kept in
# `reducer_model` so that queries can later be projected with the same model.
# NOTE(review): X_tfidf is presumably a sparse matrix; PCA accepts sparse input only in
# recent scikit-learn releases — confirm the installed version.
X_reduced, reducer_model = apply_dimensionality_reduction(X_tfidf, method=reduction_method)


# Display the results (top rows of reduced data)
import pandas as pd
pd.DataFrame(X_reduced).head()


Choose the dimensionality reduction method (PCA or SVD): pca
Using PCA for dimensionality reduction...


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.234676,0.024922,-0.059466,-0.045901,-0.026396,0.00246,0.013815,0.004375,-0.001367,-0.027643,...,-0.018329,0.013471,0.007686,0.029663,-0.020292,-0.017085,0.022763,-0.021362,-0.02186,0.009665
1,0.168373,-0.000403,-0.066696,-0.018862,-0.044559,-0.01923,0.00045,0.005908,0.054996,-0.027825,...,0.05103,-0.049493,-0.012488,-0.012784,0.002471,-0.015193,0.008046,-0.03338,-0.040444,-0.004213
2,0.311033,0.03713,-0.110636,-0.019274,-0.062646,0.019698,0.046839,-0.021144,-0.015503,-0.011933,...,-0.059537,0.012679,0.036252,-0.030165,-0.041094,-0.025123,0.004143,0.011403,0.041754,-0.029808
3,0.313564,0.067764,-0.209717,-0.049678,-0.043022,0.067974,0.028518,-0.022403,-0.007501,-0.032947,...,-0.020483,-0.027245,-0.015961,0.002216,-0.013682,0.005256,0.02657,0.028665,-0.019995,-0.081443
4,0.361319,0.060528,-0.161732,-0.019361,-0.053653,0.031254,0.05218,-0.000608,-0.017638,-0.071134,...,0.024259,-0.043019,0.004958,0.038941,-0.035209,-0.002933,0.020307,0.024084,0.032951,-0.051013


# Dynamic and User-Friendly Querying

In [None]:
# Rank the documents against a free-text query in the reduced TF-IDF space.
def recommend_documents(query, docs, X_reduced, vectorizer, reducer_model, top_k=5, measure='cosine'):
    """Return the ``top_k`` documents closest to ``query`` and their scores.

    Args:
        query: Free-text query; it is passed through ``preprocess_query`` first.
        docs: Sequence of documents, indexable by row position of ``X_reduced``.
        X_reduced: Reduced document vectors, one row per document.
        vectorizer: Fitted vectorizer; its ``transform`` is applied to the query.
        reducer_model: Fitted reducer; its ``transform`` projects the query.
        top_k: Number of documents to return. Values <= 0 return nothing.
        measure: Name forwarded to ``compute_similarity``.

    Returns:
        ``(recommended_docs, scores)``, best match first. ``scores`` holds the
        raw values returned by ``compute_similarity`` for those documents.
    """
    # Measures for which a SMALLER value means a closer document. Ranking these
    # largest-first (as for similarities) would return the farthest documents.
    distance_measures = ('euclidean', 'manhattan', 'jaccard')

    # Preprocess the query to match the documents
    query = preprocess_query(query)

    # Vectorize the query, then project it with the same reducer as the documents.
    query_tfidf = vectorizer.transform([query])
    query_reduced = reducer_model.transform(query_tfidf)

    # compute_similarity may return a (1, n) matrix or a flat vector; flatten once.
    similarity_scores = np.asarray(compute_similarity(query_reduced, X_reduced, measure)).ravel()

    # A slice of [-0:] would select everything, so handle "nothing requested" explicitly.
    if top_k <= 0 or similarity_scores.size == 0:
        return [], similarity_scores[:0]

    ascending = np.argsort(similarity_scores)
    if measure in distance_measures:
        top_k_indices = ascending[:top_k]          # smallest distance first
    else:
        top_k_indices = ascending[-top_k:][::-1]   # largest similarity first

    recommended_docs = [docs[i] for i in top_k_indices]
    return recommended_docs, similarity_scores[top_k_indices]

In [None]:
# Function to compute similarity based on the chosen measure
from scipy.stats import pearsonr # import the missing module
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.metrics import jaccard_score as jaccard
from scipy.spatial.distance import cityblock
# Function to compute similarity based on the chosen measure
def compute_similarity(query_vector, doc_vectors, measure='cosine'):
    """Score one query vector against every row of ``doc_vectors``.

    Args:
        query_vector: The query, as a ``(1, d)`` row or a flat length-``d`` vector
            (cosine/euclidean are handed the value unchanged, so they need 2-D).
        doc_vectors: Document matrix with one length-``d`` row per document.
        measure: One of ``'cosine'``, ``'euclidean'``, ``'manhattan'``,
            ``'pearson'`` or ``'jaccard'``.

    Returns:
        For ``'cosine'`` / ``'euclidean'`` the 2-D result of the scikit-learn
        pairwise function; for the other measures a flat array with one value
        per document. Cosine and Pearson are similarities (larger = closer);
        Euclidean, Manhattan and Jaccard are distances (smaller = closer).

    Raises:
        ValueError: If ``measure`` is not one of the names above.
    """
    if measure == 'cosine':
        return cosine_similarity(query_vector, doc_vectors)  # 2D is fine
    if measure == 'euclidean':
        return euclidean_distances(query_vector, doc_vectors)  # 2D is fine
    if measure not in ('manhattan', 'pearson', 'jaccard'):
        raise ValueError(f"Unknown similarity measure: {measure}")

    # The remaining measures are computed for all documents at once with NumPy
    # instead of one Python-level call per document.
    query_1d = np.asarray(query_vector, dtype=float).ravel()
    doc_matrix = np.atleast_2d(np.asarray(doc_vectors, dtype=float))

    if measure == 'manhattan':
        # Sum of absolute coordinate differences, row by row.
        return np.abs(doc_matrix - query_1d).sum(axis=1)

    if measure == 'pearson':
        # Pearson r = dot product of the mean-centred vectors divided by the
        # product of their norms. A constant vector gives 0/0 -> nan.
        query_centered = query_1d - query_1d.mean()
        docs_centered = doc_matrix - doc_matrix.mean(axis=1, keepdims=True)
        numerator = docs_centered @ query_centered
        denominator = np.linalg.norm(docs_centered, axis=1) * np.linalg.norm(query_centered)
        with np.errstate(divide='ignore', invalid='ignore'):
            correlation = numerator / denominator
        # Guard against values a hair outside [-1, 1] caused by rounding.
        return np.clip(correlation, -1.0, 1.0)

    # measure == 'jaccard': Jaccard dissimilarity on the non-zero pattern of
    # the vectors (entries are treated as booleans). Two all-zero vectors get
    # distance 0. NOTE(review): on dense reduced vectors almost every entry is
    # non-zero, so this measure barely discriminates — confirm it is wanted.
    query_nonzero = query_1d != 0
    docs_nonzero = doc_matrix != 0
    union_size = (docs_nonzero | query_nonzero).sum(axis=1)
    mismatch_size = (docs_nonzero ^ query_nonzero).sum(axis=1)
    return np.divide(
        mismatch_size,
        union_size,
        out=np.zeros(doc_matrix.shape[0]),
        where=union_size > 0,
    )


In [None]:
# Function to display the top-k recommended documents with column names
def display_recommended_documents_with_columns(recommended_docs, scores, df, similarity_measure, top_k=5):
    """Print each recommended document with its values labelled by column name.

    Args:
        recommended_docs: Document strings holding one value per line, in the
            column order of ``df``.
        scores: One score per recommended document, in the same order.
        df: DataFrame whose column names are used as labels.
        similarity_measure: Name of the measure, shown in the header line.
        top_k: Number shown in the header line.

    The labels are attached to the values of the recommended document itself.
    Looking rows up by the loop position instead (0, 1, 2, ...) would always
    print the first rows of ``df``, whatever was actually recommended.
    """
    print(f"\nTop {top_k} documents for query with '{similarity_measure}' similarity:\n")
    column_names = list(df.columns)
    for rank, (doc, score) in enumerate(zip(recommended_docs, scores), start=1):
        print(f"\nDocument {rank} (Similarity Score: {score:.4f}):")
        print(f"--- Document content with column names ---")
        values = doc.split("\n")
        if len(values) == len(column_names):
            doc_with_columns = "\n".join(
                f"{col_name}: {value}" for col_name, value in zip(column_names, values)
            )
        else:
            # A cell contained a line break (or the document does not match the
            # columns), so lines cannot be paired with columns reliably: show
            # the raw document instead of mislabelled values.
            doc_with_columns = doc
        print(doc_with_columns[:500])  # Truncate to the first 500 characters for display
        print("------------------------------------------------------------")



In [None]:
def preprocess_query(query):
    """Normalize a free-text query before it is vectorized.

    The query is lower-cased, split into tokens with ``nltk.word_tokenize``,
    and the tokens are joined back into one space-separated string.
    """
    lowered = query.lower()
    tokens = nltk.word_tokenize(lowered)
    # Re-assemble a single string: that is the form handed on to TF-IDF.
    return ' '.join(tokens)

In [None]:
# Ask the user for their query and similarity measure
# Both answers are stored exactly as typed; this cell does no validation or normalization.
query = input("Enter your search query (e.g., 'Afghanistan terrorism'): ")
similarity_measure = input("Choose the similarity measure (cosine, euclidean, jaccard, manhattan, pearson): ")



Enter your search query (e.g., 'Afghanistan terrorism'): 9/11 bombing
Choose the similarity measure (cosine, euclidean, jaccard, manhattan, pearson): cosine


In [None]:
# Validate the similarity measure input
# The measure was already collected together with the query, so it is validated
# here instead of being asked for a second time (a second prompt silently
# discarded the first answer).
valid_measures = ['cosine', 'euclidean', 'jaccard', 'manhattan', 'pearson']
# Tolerate harmless variations such as 'Cosine' or ' cosine '.
similarity_measure = similarity_measure.strip().lower()
if similarity_measure not in valid_measures:
    print(f"Invalid similarity measure. Using default 'cosine'.")
    similarity_measure = 'cosine'



Choose the similarity measure (cosine, euclidean, jaccard, manhattan, pearson): cosine


In [None]:
# Run the recommendation based on the user's input
# `reducer_model` is the fitted reducer that was returned together with `X_reduced`,
# so the query is projected with the same model as the documents.
recommended_docs, scores = recommend_documents(query, docs, X_reduced, vectorizer, reducer_model, top_k=5, measure=similarity_measure)

# Display the recommended documents with the chosen similarity measure
display_recommended_documents_with_columns(recommended_docs, scores, df, similarity_measure)



Top 5 documents for query with 'cosine' similarity:


Document 1 (Similarity Score: 0.6294):
--- Document content with column names ---
eventid: 197000000001
iyear: 1970
imonth: 7
iday: 2
extended: 0
country: 58
country_txt: Dominican Republic
region: 2
region_txt: Central America & Caribbean
provstate: NaN
city: Santo Domingo
latitude: 18.456792
longitude: -69.951164
specificity: 1.0
vicinity: 0
summary: NaN
crit1: 1
crit2: 1
crit3: 1
doubtterr: 0.0
multiple: 0.0
success: 1
suicide: 0
attacktype1: 1
attacktype1_txt: Assassination
targtype1: 14
targtype1_txt: Private Citizens & Property
targsubtype1: 68.0
targsubtype1_txt: Named
------------------------------------------------------------

Document 2 (Similarity Score: 0.6032):
--- Document content with column names ---
eventid: 197000000002
iyear: 1970
imonth: 0
iday: 0
extended: 0
country: 130
country_txt: Mexico
region: 1
region_txt: North America
provstate: Federal
city: Mexico city
latitude: 19.371887
longitude: -99.086624
specif

# Most Similar Documents in the Dataset

In [None]:
# Define similarity function
def compute_similarity(query_vector, doc_vectors, measure='cosine'):
    """Score one query vector against every row of ``doc_vectors``.

    Args:
        query_vector: The query, either a flat length-``d`` vector or a
            ``(1, d)`` row (the shape a reducer's ``transform`` returns).
        doc_vectors: Iterable / matrix of document vectors, one per row.
        measure: ``'cosine'``, ``'euclidean'``, ``'jaccard'``, ``'manhattan'``
            or ``'pearson'``.

    Returns:
        For ``'cosine'`` / ``'euclidean'`` the 2-D result of the pairwise
        function; for the other measures a flat array with one value per
        document.

    Raises:
        ValueError: If ``measure`` is not one of the names above.
    """
    if measure == 'cosine':
        return cosine_similarity(query_vector.reshape(1, -1), doc_vectors)
    elif measure == 'euclidean':
        return euclidean_distances(query_vector.reshape(1, -1), doc_vectors)
    elif measure not in ('jaccard', 'manhattan', 'pearson'):
        raise ValueError(f'Unknown similarity measure: {measure}')

    # The pairwise functions below require 1-D inputs. Flatten the query once so
    # that a (1, d) row works as well as a flat vector.
    query_1d = np.ravel(query_vector)
    if measure == 'jaccard':
        pairwise = jaccard
    elif measure == 'manhattan':
        pairwise = cityblock
    else:
        # pearsonr returns (statistic, p-value); only the statistic is kept.
        pairwise = lambda u, v: pearsonr(u, v)[0]
    return np.array([pairwise(query_1d, np.ravel(doc_vector)) for doc_vector in doc_vectors])



In [None]:
# Assuming a query document is available (for now use first document as query)
query_index = 0
# Row `query_index` of the reduced document matrix is used as the query vector.
query_vector = X_reduced[query_index]


In [None]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.spatial.distance import jaccard, cityblock
from scipy.stats import pearsonr

In [None]:
# Compute similarities for different measures
similarity_measures = ['cosine', 'euclidean', 'jaccard', 'manhattan', 'pearson']
# Maps each measure name to whatever compute_similarity returns for it.
# NOTE(review): 'cosine' and 'pearson' are similarities, while 'euclidean', 'jaccard' and
# 'manhattan' come from distance functions — the raw values are not comparable across measures.
similarity_results = {}
for measure in similarity_measures:
    similarity_results[measure] = compute_similarity(query_vector, X_reduced, measure)


In [None]:
# Evaluation based on top-k similar documents
def evaluate_similarity(similarity_scores, top_k=5, higher_is_better=True):
    """Return the indices of the ``top_k`` best-scoring documents, best first.

    Args:
        similarity_scores: Scores for all documents. May be a flat vector or a
            ``(1, n)`` matrix; it is flattened before ranking, because sorting
            a 2-D array row-wise and slicing its first axis would hand back
            the entire row instead of ``top_k`` indices.
        top_k: Number of indices to return (default 5). Values <= 0 return an
            empty array.
        higher_is_better: ``True`` for similarities (largest scores win),
            ``False`` for distances (smallest scores win).

    Returns:
        1-D integer array of at most ``top_k`` document indices, ordered from
        best to worst.
    """
    flat_scores = np.asarray(similarity_scores).ravel()
    ascending = np.argsort(flat_scores)
    if top_k <= 0:
        # A slice of [-0:] would select everything, so return an explicit empty result.
        return ascending[:0]
    if higher_is_better:
        top_docs = ascending[-top_k:][::-1]
    else:
        top_docs = ascending[:top_k]
    return top_docs


Similar Documents based on each Measure

In [None]:
# Run evaluation
# 'euclidean', 'jaccard' and 'manhattan' are distances (smaller = closer), while
# evaluate_similarity keeps the LARGEST scores. Distances are therefore negated
# before ranking so that "largest" always means "most similar".
# The scores are also flattened first, because the cosine/euclidean results
# come back as a (1, n_docs) matrix rather than a flat vector.
_DISTANCE_MEASURES = {'euclidean', 'jaccard', 'manhattan'}
evaluation_results = {
    measure: evaluate_similarity(
        np.ravel(similarity_results[measure]) * (-1.0 if measure in _DISTANCE_MEASURES else 1.0)
    )
    for measure in similarity_measures
}

# Display evaluation results
evaluation_results

{'cosine': array([[157679, 136010, 126699, ...,  63915,  57322,      0]]),
 'euclidean': array([[     0,  57322,  65075, ..., 115717, 112243, 141171]]),
 'jaccard': array([ 60566,  60567,  60568,  60592, 181690]),
 'manhattan': array([115717, 112975,  82045, 112243, 180312]),
 'pearson': array([65075, 65074, 63915, 57322,     0])}

In [None]:
# Function to display documents with column names included
def display_recommended_documents_with_columns(evaluation_results, docs, df, top_k=5):
    """Print, per measure, the rows of ``df`` referenced by the ranked indices.

    Args:
        evaluation_results: Mapping ``measure name -> document positions``. The
            positions may be a list or an array of any shape; they are
            flattened and the first ``top_k`` are shown.
        docs: Not used by this function; kept so existing calls keep working.
        df: DataFrame the positions refer to (positional ``iloc`` lookup).
        top_k: Maximum number of documents printed per measure.
    """
    for measure, indices in evaluation_results.items():
        print(f"\nTop {top_k} documents for similarity measure: {measure}")
        # np.ravel accepts lists as well as arrays of any shape.
        top_docs_indices = np.ravel(indices)[:top_k]
        for idx, doc_index in enumerate(top_docs_indices):
            print(f"\nDocument {idx + 1} (Index: {doc_index}):")
            print(f"--- Document content with column names ---")
            # Fetch the row once instead of once per column.
            row = df.iloc[doc_index]
            doc_with_columns = "\n".join([f"{col_name}: {str(row[col_name])}" for col_name in df.columns])
            print(doc_with_columns[:500])  # Truncate to the first 500 characters for display
            print("------------------------------------------------------------")

# Assuming `df` is the dataframe containing the original data, and `docs` contains the document strings
# Prints, for every measure in `evaluation_results`, the rows of `df` at the stored positions.
display_recommended_documents_with_columns(evaluation_results, docs, df)



Top 5 documents for similarity measure: cosine

Document 1 (Index: 157679):
--- Document content with column names ---
eventid: 201601140003
iyear: 2016
imonth: 1
iday: 14
extended: 0
country: 93
country_txt: Indonesia
region: 5
region_txt: Southeast Asia
provstate: Jakarta
city: Jakarta
latitude: -6.187549
longitude: 106.823085
specificity: 1.0
vicinity: 0
summary: 01/14/2016: Two assailants opened fire on a group of people gathered outside an Indonesian National Police (INP) post at Sarinah Mall in Jakarta city, Jakarta province, Indonesia. The crowd had gathered in response to an earlier attack on the post. A
------------------------------------------------------------

Document 2 (Index: 136010):
--- Document content with column names ---
eventid: 201408030105
iyear: 2014
imonth: 8
iday: 3
extended: 0
country: 102
country_txt: Jordan
region: 10
region_txt: Middle East & North Africa
provstate: Maan
city: Maan
latitude: 30.195645
longitude: 35.733423
specificity: 1.0
vicinity: 0
su

# Static Querying

In [None]:

# Recommendation function based on user query
def recommend_documents(query, docs, X_reduced, vectorizer, svd_model, top_k=5, measure='cosine'):
    """Return the ``top_k`` documents closest to ``query`` and their scores.

    Args:
        query: Free-text query; it is vectorized as given.
        docs: Sequence of documents, indexable by row position of ``X_reduced``.
        X_reduced: Reduced document vectors, one row per document.
        vectorizer: Fitted vectorizer; its ``transform`` is applied to the query.
        svd_model: Fitted reducer; its ``transform`` projects the query.
        top_k: Number of documents to return. Values <= 0 return nothing.
        measure: Name forwarded to ``compute_similarity``.

    Returns:
        ``(recommended_docs, scores)``, best match first. ``scores`` holds the
        raw values returned by ``compute_similarity`` for those documents.
    """
    # Measures for which a SMALLER value means a closer document. Ranking these
    # largest-first (as for similarities) would return the farthest documents.
    distance_measures = ('euclidean', 'manhattan', 'jaccard')

    # Transform query into TF-IDF and reduced space
    query_tfidf = vectorizer.transform([query])
    query_reduced = svd_model.transform(query_tfidf)

    # compute_similarity may return a (1, n) matrix or a flat vector; flatten once.
    similarity_scores = np.asarray(compute_similarity(query_reduced, X_reduced, measure)).ravel()

    # A slice of [-0:] would select everything, so handle "nothing requested" explicitly.
    if top_k <= 0 or similarity_scores.size == 0:
        return [], similarity_scores[:0]

    ascending = np.argsort(similarity_scores)
    if measure in distance_measures:
        top_k_indices = ascending[:top_k]          # smallest distance first
    else:
        top_k_indices = ascending[-top_k:][::-1]   # largest similarity first

    recommended_docs = [docs[i] for i in top_k_indices]
    return recommended_docs, similarity_scores[top_k_indices]


In [None]:

# Example query: 'Afghanistan terrorism'

# `reducer_model` is the reducer that was fitted on the documents earlier in the
# notebook. The name `svd` is never defined anywhere, so passing it raised NameError.
query = input('Give me a query you are looking for ')
recommended_docs, scores = recommend_documents(query, docs, X_reduced, vectorizer, reducer_model, top_k=5, measure='cosine')


Give me a query you are looking for 9/11 bombings


NameError: name 'svd' is not defined

In [None]:
# Function to display documents with column names included
def display_recommended_documents_with_columns(evaluation_results, docs, df, top_k=5):
    """Print, per measure, the rows of ``df`` referenced by the ranked indices.

    Args:
        evaluation_results: Mapping ``measure name -> document positions``. The
            positions may be a list or an array of any shape; they are
            flattened before the first ``top_k`` are taken. Without flattening,
            slicing a ``(1, n)`` array yields a whole row as one "index".
        docs: Not used by this function; kept so existing calls keep working.
        df: DataFrame the positions refer to (positional ``iloc`` lookup).
        top_k: Maximum number of documents printed per measure.
    """
    for measure, indices in evaluation_results.items():
        print(f"\nTop {top_k} documents for similarity measure: {measure}")
        top_docs_indices = np.ravel(indices)[:top_k]  # Flatten, then get top-k documents
        for idx, doc_index in enumerate(top_docs_indices):
            print(f"\nDocument {idx + 1} (Index: {doc_index}):")
            print(f"--- Document content with column names ---")
            # Fetch the row once instead of once per column.
            row = df.iloc[doc_index]
            doc_with_columns = "\n".join([f"{col_name}: {str(row[col_name])}" for col_name in df.columns])
            print(doc_with_columns[:500])  # Truncate to the first 500 characters for display
            print("------------------------------------------------------------")

# Assuming `df` is the dataframe containing the original data, and `docs` contains the document strings
# Prints, for every measure in `evaluation_results`, the rows of `df` at the stored positions.
display_recommended_documents_with_columns(evaluation_results, docs, df)
