# Search Query Anomaly Detection
## 1. Data Import and Initial Analysis

In [2]:
import io

import pandas as pd

# Load the search-query metrics from the CSV export.
data = pd.read_csv('Queries.csv')

# Per-column count of missing values.
null_values = data.isnull().sum()

# DataFrame.info() writes its report to a stream and returns None, so
# assigning its return value would leave `column_info` empty. Capture the
# report in a buffer to keep the text, then print it so the cell still
# shows the same summary.
info_buffer = io.StringIO()
data.info(buf=info_buffer)
column_info = info_buffer.getvalue()
print(column_info, end='')

# Summary statistics for the numeric columns.
descriptive_stats = data.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Top queries  1000 non-null   object 
 1   Clicks       1000 non-null   int64  
 2   Impressions  1000 non-null   int64  
 3   CTR          1000 non-null   object 
 4   Position     1000 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 39.2+ KB


## 2. Convert CTR Column

In [3]:
# CTR is loaded as percent-suffixed text; strip the '%' and rescale it to a
# fraction. Convert only while the column is still non-numeric so that
# re-running this cell does not fail on the `.str` accessor.
if not pd.api.types.is_numeric_dtype(data['CTR']):
    data['CTR'] = data['CTR'].str.rstrip('%').astype(float) / 100.0


## 3. Analyze Common Words in Search Queries

In [4]:
from collections import Counter
import re
import plotly.express as px

def clean_and_split_queries(queries):
    """Normalise search queries and split them into word tokens.

    Each query is lower-cased, every character that is neither a word
    character nor whitespace is removed, and the remainder is split on
    whitespace.

    Args:
        queries: pandas Series of query strings. Missing entries are
            treated as empty queries.

    Returns:
        pandas Series, aligned with ``queries``, whose values are lists of
        word strings (an empty list for empty or missing queries).
    """
    # Compile once instead of resolving the pattern for every row.
    punctuation = re.compile(r'[^\w\s]')

    # A missing query would otherwise reach re.sub as a non-string and raise
    # TypeError; an empty string yields an empty token list instead.
    normalized = queries.fillna('').str.lower()
    stripped = normalized.map(lambda text: punctuation.sub('', text))
    return stripped.str.split()

# Tokenise every query and keep the tokens next to the original row.
data['words'] = clean_and_split_queries(data['Top queries'])

# Flatten the per-query token lists into a single list of words.
all_words = []
for query_words in data['words']:
    all_words.extend(query_words)

# Tally how often each word occurs across all queries.
word_counts = Counter()
word_counts.update(all_words)

# Keep the 20 most frequent words as a (word, frequency) table.
top_20_words = word_counts.most_common(20)
word_freq_df = pd.DataFrame(top_20_words, columns=['word', 'frequency'])

# Bar chart of the word frequencies.
fig = px.bar(
    word_freq_df,
    x='word',
    y='frequency',
    title='Top 20 Most Common Words in Search Queries',
)
fig.show()


## 4. Top Queries by Clicks and Impressions

In [6]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# The ten rows with the most clicks and the ten with the most impressions.
top_queries_clicks = data.sort_values(by='Clicks', ascending=False).iloc[:10]
top_queries_impressions = data.sort_values(by='Impressions', ascending=False).iloc[:10]

# Side-by-side panels: clicks on the left, impressions on the right.
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Top Queries by Clicks", "Top Queries by Impressions"),
)

# Each panel gets one bar trace and a y-axis title named after its metric.
panels = [
    (top_queries_clicks, 'Clicks', 1),
    (top_queries_impressions, 'Impressions', 2),
]
for panel_df, metric, panel_col in panels:
    fig.add_trace(
        go.Bar(x=panel_df['Top queries'], y=panel_df[metric], name=metric),
        row=1, col=panel_col,
    )
    fig.update_yaxes(title_text=metric, row=1, col=panel_col)

# Shared layout: fixed canvas size, no legend, slanted query labels.
fig.update_layout(height=500, width=1000, showlegend=False)
fig.update_xaxes(title_text="Top queries", tickangle=-45)

fig.show()


## 5. Analyze Queries with Highest and Lowest CTRs

In [19]:
import plotly.graph_objects as go

# The ten rows with the highest click-through rate.
highest_ctr_queries = data.sort_values(by='CTR', ascending=False).iloc[:10]

# Blue bars with a thin outline, one per query.
top_ctr_bars = go.Bar(
    x=highest_ctr_queries['Top queries'],
    y=highest_ctr_queries['CTR'],
    marker={'color': 'blue', 'line': {'width': 1}},
)

fig_top = go.Figure(data=[top_ctr_bars])
fig_top.update_layout(
    title="Top Queries by CTR",
    xaxis_title="Top queries",
    yaxis_title="CTR",
    xaxis_tickangle=-45,
    height=700,
    width=900,
    showlegend=False,
)

fig_top.show()


In [18]:
import plotly.graph_objects as go

# The ten rows with the lowest click-through rate.
lowest_ctr_queries = data.sort_values(by='CTR', ascending=True).iloc[:10]

# One default-styled bar per query.
bottom_ctr_bars = go.Bar(
    x=lowest_ctr_queries['Top queries'],
    y=lowest_ctr_queries['CTR'],
)

fig_bottom = go.Figure(data=[bottom_ctr_bars])
fig_bottom.update_layout(
    title="Bottom Queries by CTR",
    xaxis_title="Top queries",
    yaxis_title="CTR",
    xaxis_tickangle=-45,
    height=500,
    width=900,
    showlegend=False,
)

fig_bottom.show()


## 6. Correlation Between Metrics

In [20]:
# Correlate the four search metrics by name rather than by dtype. A dtype
# filter depends on the notebook's current state: it can pick up helper
# columns added to `data` later (such as the `anomaly` flag) when this cell
# is re-run, and it silently drops CTR if the percent conversion has not
# been applied yet.
metric_columns = ['Clicks', 'Impressions', 'CTR', 'Position']
numeric_data = data[metric_columns]
correlation_matrix = numeric_data.corr()

# Heatmap with the coefficient printed in each cell.
fig = px.imshow(correlation_matrix, text_auto=True, title='Correlation Matrix')
fig.show()


## 7. Anomaly Detection Using Isolation Forest

In [24]:
from sklearn.ensemble import IsolationForest
import pandas as pd

# Metrics the model is fitted on.
features = ['Clicks', 'Impressions', 'CTR', 'Position']

# Fixed seed and a 0.05 contamination setting; the predicted label for every
# row is stored back on the frame.
model = IsolationForest(random_state=42, contamination=0.05)
data['anomaly'] = model.fit_predict(data[features])

# Rows labelled -1 are the ones kept as anomalies.
is_anomaly = data['anomaly'].eq(-1)
anomalies = data.loc[is_anomaly]
anomalies_display = anomalies.drop(columns=['anomaly'])

# Report only the query text and its metrics, renumbered from zero.
report_columns = ['Top queries', 'Clicks', 'Impressions', 'CTR', 'Position']
display_df = anomalies_display.loc[:, report_columns].reset_index(drop=True)

# Widen the console output and centre the headers before printing.
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
print(display_df)



                    Top queries                   Clicks  Impressions    CTR   Position
0                    number guessing game python   5223      14578     0.3583     1.61 
1                            thecleverprogrammer   2809       3456     0.8128     1.02 
2               python projects with source code   2077      73380     0.0283     5.94 
3      classification report in machine learning   2012       4959     0.4057     1.28 
4                          the clever programmer   1931       2528     0.7638     1.09 
5            standard scaler in machine learning   1559       7292     0.2138     1.53 
6                                   aman kharwal   1490       5752     0.2590     3.75 
7                    python turtle graphics code   1455      13585     0.1071     4.60 
8          python game projects with source code   1421       4946     0.2873     2.23 
9            82 python projects with source code   1343       3562     0.3770     1.32 
10                       guess t