In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/queries/Queries.csv


In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import plotly.express as px
import plotly.io as pio
# Make "plotly_white" the default template for every Plotly figure created below
pio.templates.default = "plotly_white"

In [3]:
# Load Queries.csv from the Kaggle input directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='/kaggle/input/queries/Queries.csv')

In [4]:
# Show the loaded frame; the notebook renders a truncated view (first and last rows)
df

Unnamed: 0,Top queries,Clicks,Impressions,CTR,Position
0,number guessing game python,5223,14578,35.83%,1.61
1,thecleverprogrammer,2809,3456,81.28%,1.02
2,python projects with source code,2077,73380,2.83%,5.94
3,classification report in machine learning,2012,4959,40.57%,1.28
4,the clever programmer,1931,2528,76.38%,1.09
...,...,...,...,...,...
995,human activity recognition python code,48,369,13.01%,6.53
996,python contact book project,48,334,14.37%,4.92
997,why standard scaler is used,48,295,16.27%,2.07
998,credit scoring machine learning python,48,243,19.75%,6.40


In [5]:
# Summarise column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Top queries  1000 non-null   object 
 1   Clicks       1000 non-null   int64  
 2   Impressions  1000 non-null   int64  
 3   CTR          1000 non-null   object 
 4   Position     1000 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 39.2+ KB


In [6]:
# Cleaning the CTR column: turn percentage strings such as "35.83%" into
# fractions (0.3583).
# The dtype guard keeps this cell idempotent: once CTR is numeric, the `.str`
# accessor would raise AttributeError if the cell were run a second time.
if not pd.api.types.is_numeric_dtype(df['CTR']):
    df['CTR'] = df['CTR'].str.rstrip('%').astype('float') / 100


In [7]:
# Function to clean and split the queries into words
def clean_and_split(query):
    """Lower-case a search query and return its purely alphabetic words.

    Parameters
    ----------
    query : str
        Raw search query text.

    Returns
    -------
    list of str
        Lower-cased words in order of appearance. A token is kept only when it
        consists solely of ASCII letters between word boundaries, so tokens
        that mix letters with digits or underscores (e.g. "python3") are
        dropped as a whole. Non-string input (for example the NaN pandas
        produces for an empty CSV cell) yields an empty list instead of
        raising AttributeError.
    """
    # Guard against missing values so one empty cell cannot abort the word count
    if not isinstance(query, str):
        return []
    return re.findall(r'\b[a-zA-Z]+\b', query.lower())

# Tally how often each word occurs across all queries
word_counts = Counter(
    word
    for query in df['Top queries']
    for word in clean_and_split(query)
)

# Keep the 20 most frequent words as a two-column frame
word_freq_df = pd.DataFrame(
    word_counts.most_common(20),
    columns=['Word', 'Frequency'],
)

# Bar chart of the word frequencies
fig = px.bar(
    word_freq_df,
    x='Word',
    y='Frequency',
    title='Top 20 Most Common Words in Search Queries',
)
fig.show()

In [8]:
# Ten queries with the most clicks (all columns are kept)
top_queries_clicks_vis = df.nlargest(n=10, columns='Clicks')



In [9]:
# Ten queries with the most impressions, restricted to the two plotted columns
top_queries_impressions_vis = df[['Top queries', 'Impressions']].nlargest(10, 'Impressions')

In [10]:
# Bar chart of the ten most-clicked queries
fig_clicks = px.bar(
    top_queries_clicks_vis,
    x='Top queries',
    y='Clicks',
    title='Top Queries by Clicks',
)
fig_clicks.show()

# Bar chart of the ten queries with the most impressions
fig_impressions = px.bar(
    top_queries_impressions_vis,
    x='Top queries',
    y='Impressions',
    title='Top Queries by Impressions',
)
fig_impressions.show()

In [11]:
# Ten queries with the highest click-through rate, charted first
top_ctr_vis = df[['Top queries', 'CTR']].nlargest(10, 'CTR')
fig_top_ctr = px.bar(
    top_ctr_vis,
    x='Top queries',
    y='CTR',
    title='Top Queries by CTR',
)
fig_top_ctr.show()

# Ten queries with the lowest click-through rate, charted second
bottom_ctr_vis = df[['Top queries', 'CTR']].nsmallest(10, 'CTR')
fig_bottom_ctr = px.bar(
    bottom_ctr_vis,
    x='Top queries',
    y='CTR',
    title='Bottom Queries by CTR',
)
fig_bottom_ctr.show()

In [12]:
# Pairwise correlations between the four numeric metrics, drawn as a heatmap
numeric_metrics = df.loc[:, ['Clicks', 'Impressions', 'CTR', 'Position']]
correlation_matrix = numeric_metrics.corr()
fig_corr = px.imshow(
    correlation_matrix,
    text_auto=True,
    title='Correlation Matrix',
)
fig_corr.show()

1. **Using Isolation Forest for anomaly detection**

In [13]:
from sklearn.ensemble import IsolationForest

# Selecting relevant features
features = df[['Clicks', 'Impressions', 'CTR', 'Position']]

# Initializing Isolation Forest.
# contamination is the expected proportion of outliers (here 1% of the rows).
# random_state pins the random tree construction so the same rows are flagged
# on every Restart & Run All; without it the result changes between runs.
iso_forest = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)

# Fitting the model
iso_forest.fit(features)

# Predicting anomalies: predict() labels outliers as -1 and inliers as 1
df['anomaly'] = iso_forest.predict(features)

# Filtering out the anomalies
anomalies = df[df['anomaly'] == -1]


X does not have valid feature names, but IsolationForest was fitted with feature names



2. **Using DBSCAN for anomaly detection**

In [14]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

In [15]:
# Feature matrix used for density-based clustering
features = df[['Clicks', 'Impressions', 'CTR', 'Position']]

# Standardise the features before clustering
scaler = StandardScaler()
scaler_features = scaler.fit_transform(features)

# Cluster the standardised rows and store each row's cluster label
dbscan = DBSCAN(min_samples=5, eps=0.5)
df['cluster'] = dbscan.fit_predict(scaler_features)

In [16]:
# Rows with cluster label -1 are treated as anomalies
anomalies = df.loc[df['cluster'].eq(-1)]
print(anomalies.loc[:, ['Top queries', 'Clicks', 'Impressions', 'CTR', 'Position']])

                                    Top queries  Clicks  Impressions     CTR  \
0                   number guessing game python    5223        14578  0.3583   
1                           thecleverprogrammer    2809         3456  0.8128   
2              python projects with source code    2077        73380  0.0283   
3     classification report in machine learning    2012         4959  0.4057   
4                         the clever programmer    1931         2528  0.7638   
..                                          ...     ...          ...     ...   
664              online payment fraud detection      73        10368  0.0070   
671  machine learning projects with source code      72         2249  0.0320   
684                             turtle graphics      71        10096  0.0070   
858                      water quality analysis      56         7359  0.0076   
929                               python turtle      52        18228  0.0029   

     Position  
0        1.61  
1      

In [17]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

# Feature matrix for the Local Outlier Factor detector
features = df[['Clicks', 'Impressions', 'CTR', 'Position']]

# Standardise the features before fitting the detector
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# contamination is the expected proportion of outliers
lof = LocalOutlierFactor(contamination=0.01)

# NOTE: despite the column name, fit_predict returns class labels
# (-1 = outlier, 1 = inlier), not continuous scores
df['anomaly_score'] = lof.fit_predict(scaled_features)

# Keep only the rows labelled as outliers
anomalies = df.loc[df['anomaly_score'].eq(-1)]


In [18]:
# Print the descriptive columns of the rows currently held in `anomalies`
print(anomalies.loc[:, ['Top queries', 'Clicks', 'Impressions', 'CTR', 'Position']])

                           Top queries  Clicks  Impressions     CTR  Position
0          number guessing game python    5223        14578  0.3583      1.61
1                  thecleverprogrammer    2809         3456  0.8128      1.02
2     python projects with source code    2077        73380  0.0283      5.94
167                text to handwriting     222        11283  0.0197     28.52
381       data science research topics     118         1718  0.0687     23.60
503  book recommendation system python      93          265  0.3509      6.45
664     online payment fraud detection      73        10368  0.0070     16.88
858             water quality analysis      56         7359  0.0076     27.56
927          the clever programmer.com      53           64  0.8281      1.00
928           the cleverprogrammer.com      53           62  0.8548      1.00
